diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 27894d27e7..762d436028 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -12,6 +12,7 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 96e5227186..48cd34c6ca 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml index e4ba4281c2..9108fb6215 100644 --- a/conda/environments/all_cuda-120_arch-aarch64.yaml +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -14,6 +14,7 @@ dependencies: - cmake>=3.26.4 - cuda-cudart-dev - cuda-nvcc +- cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 - cuda-version=12.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d0430e10f6..8f1fbf6744 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -14,6 +14,7 @@ dependencies: - cmake>=3.26.4 - cuda-cudart-dev - cuda-nvcc +- cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 - cuda-version=12.0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 579a8a0ceb..b5fc4e3bd5 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -12,6 +12,7 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 2c92ad0a99..b868f26e15 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml new file mode 100644 index 0000000000..4a3818fe5d --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -0,0 +1,40 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-aarch64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==23.12.* +- scikit-build>=0.13.1 +- sysroot_linux-aarch64==2.17 +name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml new file mode 100644 index 0000000000..3d6f8c4ec1 --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -0,0 +1,40 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==23.12.* +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-120_arch-x86_64 diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp index 97d1bbf307..8a0a3ffd37 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp +++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp @@ -49,7 +49,7 @@ void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissCpuIVFPQ<T>::BuildParam& param) { parse_base_build_param<T>(conf, param); - param.M = conf.at("M"); + param.M_ratio = conf.at("M_ratio"); if (conf.contains("usePrecomputed")) { param.usePrecomputed = conf.at("usePrecomputed"); } else { diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 028a444530..626e52b086 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -229,7 +229,7 @@ template <typename T> class FaissCpuIVFPQ : public FaissCpu<T> { public: struct BuildParam : public FaissCpu<T>::BuildParam { -   int M; +   int M_ratio; int bitsPerCode; bool usePrecomputed; }; @@ -237,8 +237,12 @@ class FaissCpuIVFPQ : public FaissCpu<T> { FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu<T>(metric, dim, param) { this->init_quantizer(dim); -    this->index_ = std::make_unique<faiss::IndexIVFPQ>( -      this->quantizer_.get(), dim, param.nlist, param.M, param.bitsPerCode, this->metric_type_); +    this->index_ = std::make_unique<faiss::IndexIVFPQ>(this->quantizer_.get(), + dim, + param.nlist, + dim / param.M_ratio, + param.bitsPerCode, + this->metric_type_); } void save(const std::string& file) const override diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu index 8b04ba1980..8de8973f16 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -50,7 +50,7 @@ void
parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFPQ<T>::BuildParam& param) { parse_base_build_param<T>(conf, param); - param.M = conf.at("M"); + param.M_ratio = conf.at("M_ratio"); if (conf.contains("usePrecomputed")) { param.usePrecomputed = conf.at("usePrecomputed"); } else { diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 38eeddf813..8db8e29ef7 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -263,7 +263,7 @@ template <typename T> class FaissGpuIVFPQ : public FaissGpu<T> { public: struct BuildParam : public FaissGpu<T>::BuildParam { - int M; + int M_ratio; bool useFloat16; bool usePrecomputed; }; @@ -274,11 +274,12 @@ class FaissGpuIVFPQ : public FaissGpu<T> { config.useFloat16LookupTables = param.useFloat16; config.usePrecomputedTables = param.usePrecomputed; config.device = this->device_; + this->index_ = std::make_unique<faiss::gpu::GpuIndexIVFPQ>(&(this->gpu_resource_), dim, param.nlist, - param.M, + dim / param.M_ratio, 8, // FAISS only supports bitsPerCode=8 this->metric_type_, config); diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index 3b9bcc7e15..fa20c5c223 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -272,13 +272,5 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) -{ - rmm::mr::cuda_memory_resource cuda_mr; - // Construct a resource that uses a coalescing best-fit pool allocator - rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr}; - rmm::mr::set_current_device_resource( - &pool_mr); // Updates the current device resource pointer to `pool_mr` - return raft::bench::ann::run_main(argc, argv); -} +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } #endif diff --git a/dependencies.yaml b/dependencies.yaml index 8b2526a4ff..aba81d7ed9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -22,7 +22,7 @@ files: bench_ann: output: conda matrix: - cuda: ["11.8"] + cuda: ["11.8", "12.0"] arch: [x86_64, aarch64] includes: - build @@ -246,6 +246,7 @@ dependencies: cuda: "12.0" packages: - cuda-version=12.0 + - cuda-nvtx-dev - cuda-cudart-dev - cuda-profiler-api - libcublas-dev @@ -257,6 +258,7 @@ packages: - cuda-version=11.8 - cudatoolkit + - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 @@ -271,6 +273,7 @@ packages: - cuda-version=11.5 - cudatoolkit + - cuda-nvtx=11.5 - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages - libcublas-dev>=11.7.3.1,<=11.7.4.6 - libcublas>=11.7.3.1,<=11.7.4.6 @@ -285,6 +288,7 @@ packages: - cuda-version=11.4 - cudatoolkit + - &cudanvtx114 cuda-nvtx=11.4 - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages - &libcublas_dev114 libcublas-dev>=11.5.2.43,<=11.6.5.2 - &libcublas114 libcublas>=11.5.2.43,<=11.6.5.2 @@ -299,6 +303,7 @@ packages: - cuda-version=11.2 - cudatoolkit + - *cudanvtx114 - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs, # so 11.2 uses 11.4 packages (the oldest available).
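The FAISS changes above replace the absolute PQ sub-quantizer count `M` with an `M_ratio` that is divided into the vector dimensionality (`dim / param.M_ratio` in both the CPU and GPU wrappers). Below is a minimal sketch of that derivation with a hypothetical helper name, not part of this diff; FAISS itself requires that the sub-quantizer count divide the dimensionality:

```python
# Illustrative helper (not part of this diff) mirroring `dim / param.M_ratio`
# in the FAISS wrappers above. FAISS requires the sub-quantizer count M to
# divide the vector dimensionality d, which holds whenever M_ratio divides d.
def pq_subquantizers(dim: int, m_ratio: int) -> int:
    if dim % m_ratio != 0:
        raise ValueError(f"M_ratio={m_ratio} must divide dim={dim}")
    return dim // m_ratio

# e.g. 128-dimensional SIFT vectors: M_ratio=4 -> M=32, M_ratio=2 -> M=64
assert pq_subquantizers(128, 4) == 32
```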
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index 075d82a135..90a5e54e32 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -15,34 +15,34 @@ IVF-flat uses an inverted-file index, which partitions the vectors into a series IVF-flat is a simple algorithm which won't save any space, but it provides competitive search times even at higher levels of recall. -| Parameter | Type | Required | Data Type | Default | Description | |-----------------------|------------------|----------|----------------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | +| Parameter | Type | Required | Data Type | Default | Description | +|----------------------|------------------|----------|----------------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `nlist` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put fewer points into each cluster but this will impact index build time as more clusters need to be trained. | +| `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of k-means iterations to use when training the clusters. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | | `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside? | -| `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? | -| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | +| `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside? | +| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | ### `raft_ivf_pq` IVF-pq is an inverted-file index, which partitions the vectors into a series of clusters, or lists, in a similar way to IVF-flat above. The difference is that IVF-PQ uses product quantization to also compress the vectors, giving the index a smaller memory footprint.
Unfortunately, higher levels of compression can also shrink recall, which a refinement step can improve when the original vectors are still available. -| Parameter | Type | Required | Data Type | Default | Description | |-------------------------|----------------|---|----------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of k-means iterations to use when training the clusters. | -| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | -| `pq_dim` | `build_param` | N | Positive Integer. Multiple of 8. | 0 | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8. | -| `pq_bits` | `build_param` | N | Positive Integer. [4-8] | 8 | Bit length of the vector element after quantization. | -| `codebook_kind` | `build_param` | N | ["cluster", "subspace"] | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail | -| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside? | -| `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? | -| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | +| Parameter | Type | Required | Data Type | Default | Description | +|------------------------|----------------|---|----------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `nlist` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put fewer points into each cluster but this will impact index build time as more clusters need to be trained. | +| `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of k-means iterations to use when training the clusters. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | +| `pq_dim` | `build_param` | N | Positive Integer. Multiple of 8. | 0 | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8. | +| `pq_bits` | `build_param` | N | Positive Integer. [4-8] | 8 | Bit length of the vector element after quantization. | +| `codebook_kind` | `build_param` | N | ["cluster", "subspace"] | "subspace" | Type of codebook. 
See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail | +| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside? | +| `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside? | +| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | | `internalDistanceDtype` | `search_params` | N | [`float`, `half`] | `half` | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy. | -| `smemLutDtype` | `search_params` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. | -| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | +| `smemLutDtype` | `search_params` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. | +| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | ### `raft_cagra` @@ -86,13 +86,13 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | Parameter | Type | Required | Data Type | Default | Description | |------------------|----------------|----------|----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | +| `nlist` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put fewer points into each cluster but this will impact index build time as more clusters need to be trained. | | `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | -| `M` | `build_param` | Y | Positive Integer Power of 2 [8-64] | | Number of chunks or subquantizers for each vector. | +| `M_ratio` | `build_param` | Y | Positive Integer >0 | | Ratio used to compute the number of chunks or subquantizers (`M`) for each vector: `M = dims / M_ratio`. | | `usePrecomputed` | `build_param` | N | Boolean. Default=`false` | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage. | | `useFloat16` | `build_param` | N | Boolean. Default=`false` | `false` | Use half-precision floats for clustering step. | | `numProbes` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. 
| -| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | +| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | ### `faiss_cpu_flat` @@ -108,26 +108,26 @@ Use FAISS flat index on the CPU, which performs an exact search using brute-forc ### `faiss_cpu_ivf_flat` Use FAISS IVF-Flat index on CPU | Parameter | Type | Required | Data Type | Default | Description | -|-----------|----------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | -| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | -| `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | +|----------|----------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `nlist` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put fewer points into each cluster but this will impact index build time as more clusters need to be trained. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | +| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | +| `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | ### `faiss_cpu_ivf_pq` Use FAISS IVF-PQ index on CPU -| Parameter | Type | Required | Data Type | Default | Description | |------------------|----------------|----------|------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | -| `M` | `build_param` | Y | Positive Integer Power of 2 [8-64] | | Number of chunks or subquantizers for each vector. 
| +| Parameter | Type | Required | Data Type | Default | Description | +|-----------------|----------------|----------|------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `nlist` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put fewer points into each cluster but this will impact index build time as more clusters need to be trained. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | +| `M_ratio` | `build_param` | Y | Positive Integer >0 | | Ratio used to compute the number of chunks or subquantizers (`M`) for each vector: `M = dims / M_ratio`. | | `usePrecomputed` | `build_param` | N | Boolean. Default=`false` | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage. | -| `bitsPerCode` | `build_param` | N | Positive Integer [4-8] | 8 | Number of bits to use for each code. | -| `numProbes` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | -| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | -| `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | +| `bitsPerCode` | `build_param` | N | Positive Integer [4-8] | 8 | Number of bits to use for each code. | +| `numProbes` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | +| `refine_ratio` | `search_params` | N| Positive Number >=0 | 0 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | +| `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | ## HNSW diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 6a436a7213..305cb836e3 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -122,38 +122,52 @@ specified configuration.
The usage of the script `raft-ann-bench.run` is: ```bash -usage: run.py [-h] [-k COUNT] [-bs BATCH_SIZE] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] - [-f] +usage: __main__.py [-h] [--subset-size SUBSET_SIZE] [-k COUNT] [-bs BATCH_SIZE] [--dataset-configuration DATASET_CONFIGURATION] [--configuration CONFIGURATION] [--dataset DATASET] + [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-f] [-m SEARCH_MODE] options: -h, --help show this help message and exit + --subset-size SUBSET_SIZE + the number of subset rows of the dataset to build the index (default: None) -k COUNT, --count COUNT the number of nearest neighbors to search for (default: 10) -bs BATCH_SIZE, --batch-size BATCH_SIZE number of query vectors to use in each query trial (default: 10000) + --dataset-configuration DATASET_CONFIGURATION + path to YAML configuration file for datasets (default: None) --configuration CONFIGURATION - path to configuration file for a dataset (default: None) - --dataset DATASET dataset whose configuration file will be used (default: glove-100-inner) + path to YAML configuration file or directory for algorithms. Any run groups found in the specified file/directory will automatically override groups of the same name + present in the default configurations, including `base` (default: None) + --dataset DATASET name of dataset (default: glove-100-inner) --dataset-path DATASET_PATH - path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR}) + path to dataset folder, by default will look in RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets subdirectory from the calling directory (default: + os.getcwd()/datasets/) --build --search --algorithms ALGORITHMS - run only comma separated list of named algorithms (default: None) - --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) + run only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is run by default (default: None) + --groups GROUPS run only comma separated groups of parameters (default: base) + --algo-groups ALGO_GROUPS + add comma separated <algorithm>.<group> to run. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None) -f, --force re-run algorithms even if their results already exist (default: False) - -m MODE, --search-mode MODE - run search in 'latency' (measure individual batches) or - 'throughput' (pipeline batches and measure end-to-end) mode. - (default: 'latency') + -m SEARCH_MODE, --search-mode SEARCH_MODE + run search in 'latency' (measure individual batches) or 'throughput' (pipeline batches and measure end-to-end) mode (default: throughput) ``` -`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset. -The configuration file should be name as `<dataset>.json`. It is optional if the name of the dataset is -provided with the `dataset` argument, in which case -a configuration file will be searched for as `python/raft-ann-bench/src/raft-ann-bench/run/conf/<dataset>.json`. -For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/` -and an index search statistics JSON file in `/result/search/`.
+`dataset`: name of the dataset to be looked up in [datasets.yaml](#yaml-dataset-config) + +`dataset-configuration`: optional filepath to a custom dataset YAML config which has an entry for the dataset passed with `dataset` + +`configuration`: optional filepath to YAML configuration for an algorithm or to directory that contains YAML configurations for several algorithms. [Here's how to configure an algorithm.](#yaml-algo-config) + +`algorithms`: runs all algorithms that it can find in the YAML configs located by `configuration`. By default, only the `base` group will be run. + +`groups`: run only specific groups of parameter configurations for an algorithm. Groups are defined in the YAML configs (see `configuration`); by default only the `base` group is run + +`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, for example `raft_cagra.large` + +For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/` +and an index search statistics JSON file in `/result/search/`. NOTE: The filenames will not have "_{group}" if `group = "base"`. `dataset-path` : 1. data is read from `<dataset-path>/<dataset>` @@ -188,18 +202,21 @@ CSV file in `/result/search/<-k{k}-batch_size{batch_size} The usage of this script is: ```bash -usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] - [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] +usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT] + [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] options: -h, --help show this help message and exit - --dataset DATASET dataset to download (default: glove-100-inner) + --dataset DATASET dataset to plot (default: glove-100-inner) --dataset-path DATASET_PATH - path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR}) + path to dataset folder (default: os.getcwd()/datasets/) --output-filepath OUTPUT_FILEPATH directory for PNG to be saved (default: os.getcwd()) --algorithms ALGORITHMS - plot only comma separated list of named algorithms (default: None) + plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is plotted by default (default: None) + --groups GROUPS plot only comma separated groups of parameters (default: base) + --algo-groups ALGO_GROUPS + add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None) -k COUNT, --count COUNT the number of nearest neighbors to search for (default: 10) -bs BATCH_SIZE, --batch-size BATCH_SIZE number of query vectors to use in each query trial (default: 10000) --build --search --x-scale X_SCALE Scale to use when drawing the X-axis (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) --raw Show raw results (not just Pareto frontier) in faded colours (default: False) ``` +`algorithms`: plots all algorithms for which it can find results for the specified `dataset`. By default, only the `base` group will be plotted. + +`groups`: plot only specific groups of parameter configurations for an algorithm. 
Groups are defined in the YAML configs (see `configuration`); by default only the `base` group is plotted + +`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to plot results for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, for example `raft_cagra.large` The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall. @@ -394,40 +416,42 @@ Note that the actual table displayed on the screen may differ slightly as the hy ## Creating and customizing dataset configurations -A single configuration file will often define a set of algorithms, with associated index and search parameters, for a specific dataset. A configuration file uses json format with 4 major parts: -1. Dataset information -2. Algorithm information -3. Index parameters -4. Search parameters +A single configuration will often define a set of algorithms, with associated index and search parameters, that can be generalized across datasets. We use YAML to define dataset-specific and algorithm-specific configurations. -Below is a simple example configuration file for the 1M-scale `sift-128-euclidean` dataset: +A default `datasets.yaml` is provided by RAFT in `${RAFT_HOME}/python/raft-ann-bench/src/raft-ann-bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset: -```json -{ - "dataset": { - "name": "sift-128-euclidean", - "base_file": "sift-128-euclidean/base.fbin", - "query_file": "sift-128-euclidean/query.fbin", - "subset_size": 1000000, - "groundtruth_neighbors_file": "sift-128-euclidean/groundtruth.neighbors.ibin", - "distance": "euclidean" - }, - "index": [] -} +```yaml +- name: sift-128-euclidean + base_file: sift-128-euclidean/base.fbin + query_file: sift-128-euclidean/query.fbin + groundtruth_neighbors_file: sift-128-euclidean/groundtruth.neighbors.ibin + distance: euclidean ``` -The `index` section will contain a list of index objects, each of which will have the following form: -```json -{ - "name": "algo_name.unique_index_name", - "algo": "algo_name", - "file": "sift-128-euclidean/algo_name/param1_val1-param2_val2", - "build_param": { "param1": "val1", "param2": "val2" }, - "search_params": [{ "search_param1": "search_val1" }] -} +Configuration files for ANN algorithms supported by `raft-ann-bench` are provided in `${RAFT_HOME}/python/raft-ann-bench/src/raft-ann-bench/run/conf`. The `raft_cagra` algorithm configuration looks like: +```yaml +name: raft_cagra +groups: + base: + build: + graph_degree: [32, 64] + intermediate_graph_degree: [64, 96] + search: + itopk: [32, 64, 128] + + large: + build: + graph_degree: [32, 64] + search: + itopk: [32, 64, 128] ``` +The default parameters for which the benchmarks are run can be overridden by creating a custom YAML file for algorithms with a `base` group. -The table below contains the possible settings for the `algo` field. Each unique algorithm will have its own set of `build_param` and `search_params` settings. The [ANN Algorithm Parameter Tuning Guide](ann_benchmarks_param_tuning.md) contains detailed instructions on choosing build and search parameters for each supported algorithm. +The config above has 2 fields: +1. `name` - define the name of the algorithm for which the parameters are being specified. +2.
`groups` - define a run group which has a particular set of parameters. Each group helps create a cross-product of all hyper-parameter fields for `build` and `search`. + +The table below contains all algorithms supported by RAFT. Each unique algorithm will have its own set of `build` and `search` settings. The [ANN Algorithm Parameter Tuning Guide](ann_benchmarks_param_tuning.md) contains detailed instructions on choosing build and search parameters for each supported algorithm. | Library | Algorithms | |-----------|------------------------------------------------------------------| @@ -437,8 +461,6 @@ The table below contains the possible settings for the `algo` field. Each unique | HNSWlib | `hnswlib` | | RAFT | `raft_brute_force`, `raft_cagra`, `raft_ivf_flat`, `raft_ivf_pq` | -By default, the index will be placed in `bench/ann/data//index/`. Using `sift-128-euclidean` for the dataset with the `algo` example above, the indexes would be placed in `bench/ann/data/sift-128-euclidean/index/algo_name/param1_val1-param2_val2`. - ## Adding a new ANN algorithm ### Implementation and Configuration diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index fd9d00f43c..fe2670411e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -41,7 +41,12 @@ def convert_json_to_csv_build(dataset, dataset_path): "time": df["real_time"], } ) - write.to_csv(file.replace(".json", ".csv"), index=False) + filepath = os.path.normpath(file).split(os.sep) + filename = filepath[-1].split("-")[0] + ".csv" + write.to_csv( + os.path.join(f"{os.sep}".join(filepath[:-1]), filename), + index=False, + ) def convert_json_to_csv_search(dataset, dataset_path): diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index 6ec2cdaf22..ef81768f4d 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -305,7 +305,7 @@ def mean_y(algo): fig = ax.get_figure() print(f"writing build output to {fn_out}") plt.title("Build Time for Highest QPS") - plt.suptitle(f"{dataset} k={k} batch_size={batch_size}") + plt.suptitle(f"{dataset}") plt.ylabel("Build Time (s)") fig.savefig(fn_out) @@ -345,25 +345,70 @@ def load_lines(results_path, result_files, method, index_key): def load_all_results( - dataset_path, algorithms, k, batch_size, method, index_key + dataset_path, + algorithms, + groups, + algo_groups, + k, + batch_size, + method, + index_key, ): results_path = os.path.join(dataset_path, "result", method) result_files = os.listdir(results_path) result_files = [ - result_filename - for result_filename in result_files - if "csv" in result_filename - and f"{k}-{batch_size}" - == "-".join(result_filename.replace(".csv", "").split("-")[1:]) + result_file for result_file in result_files if ".csv" in result_file ] - if len(algorithms) > 0: + if method == "search": result_files = [ result_filename for result_filename in result_files - if result_filename.split("-")[0] in algorithms + if f"{k}-{batch_size}" in result_filename + ] + algo_group_files = [ + result_filename.split("-")[0] for result_filename in result_files ] + else: + algo_group_files = [ + result_filename for result_filename in result_files + ] + for i in range(len(algo_group_files)): + algo_group = algo_group_files[i].replace(".csv", 
"").split("_") + if len(algo_group) == 2: + algo_group_files[i] = ("_".join(algo_group), "base") + else: + algo_group_files[i] = ("_".join(algo_group[:-1]), algo_group[-1]) + algo_group_files = list(zip(*algo_group_files)) - results = load_lines(results_path, result_files, method, index_key) + if len(algorithms) > 0: + final_results = [ + result_files[i] + for i in range(len(result_files)) + if (algo_group_files[0][i] in algorithms) + and (algo_group_files[1][i] in groups) + ] + else: + final_results = [ + result_files[i] + for i in range(len(result_files)) + if (algo_group_files[1][i] in groups) + ] + + if len(algo_groups) > 0: + split_algo_groups = [ + algo_group.split(".") for algo_group in algo_groups + ] + split_algo_groups = list(zip(*split_algo_groups)) + final_algo_groups = [ + result_files[i] + for i in range(len(result_files)) + if (algo_group_files[0][i] in split_algo_groups[0]) + and (algo_group_files[1][i] in split_algo_groups[1]) + ] + final_results = final_results + final_algo_groups + final_results = set(final_results) + + results = load_lines(results_path, final_results, method, index_key) return results @@ -379,7 +424,7 @@ def main(): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( - "--dataset", help="dataset to download", default="glove-100-inner" + "--dataset", help="dataset to plot", default="glove-100-inner" ) parser.add_argument( "--dataset-path", @@ -394,9 +439,21 @@ def main(): parser.add_argument( "--algorithms", help="plot only comma separated list of named \ - algorithms", + algorithms. If parameters `groups` and `algo-groups \ + are both undefined, then group `base` is plot by default", default=None, ) + parser.add_argument( + "--groups", + help="plot only comma separated groups of parameters", + default="base", + ) + parser.add_argument( + "--algo-groups", + "--algo-groups", + help='add comma separated . to plot. \ + Example usage: "--algo-groups=raft_cagra.large,hnswlib.large"', + ) parser.add_argument( "-k", "--count", @@ -437,6 +494,11 @@ def main(): algorithms = args.algorithms.split(",") else: algorithms = [] + groups = args.groups.split(",") + if args.algo_groups: + algo_groups = args.algo_groups.split(",") + else: + algo_groups = [] k = args.count batch_size = args.batch_size if not args.build and not args.search: @@ -452,12 +514,14 @@ def main(): ) build_output_filepath = os.path.join( args.output_filepath, - f"build-{args.dataset}-k{k}-batch_size{batch_size}.png", + f"build-{args.dataset}.png", ) search_results = load_all_results( os.path.join(args.dataset_path, args.dataset), algorithms, + groups, + algo_groups, k, batch_size, "search", @@ -480,6 +544,8 @@ def main(): build_results = load_all_results( os.path.join(args.dataset_path, args.dataset), algorithms, + groups, + algo_groups, k, batch_size, "build", diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 30d642f3ac..477c289666 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -14,9 +14,11 @@ # limitations under the License. 
import argparse +import itertools import json import os import subprocess +from importlib import import_module import yaml @@ -45,15 +47,20 @@ def validate_algorithm(algos_conf, algo, gpu_present): ) -def find_executable(algos_conf, algo, k, batch_size): +def find_executable(algos_conf, algo, group, k, batch_size): executable = algos_conf[algo]["executable"] +    if group != "base": +        return_str = f"{algo}_{group}-{k}-{batch_size}" +    else: +        return_str = f"{algo}-{k}-{batch_size}" + build_path = os.getenv("RAFT_HOME") if build_path is not None: build_path = os.path.join(build_path, "cpp", "build", executable) if os.path.exists(build_path): print(f"-- Using RAFT bench from repository in {build_path}. ") -            return (executable, build_path, f"{algo}-{k}-{batch_size}") +            return (executable, build_path, return_str) # if there is no build folder present, we look in the conda environment conda_path = os.getenv("CONDA_PREFIX") @@ -61,7 +68,7 @@ conda_path = os.path.join(conda_path, "bin", "ann", executable) if os.path.exists(conda_path): print("-- Using RAFT bench found in conda environment. ") -            return (executable, conda_path, f"{algo}-{k}-{batch_size}") +            return (executable, conda_path, return_str) else: raise FileNotFoundError(executable) @@ -151,7 +158,7 @@ def main(): gpu_present = False with open(f"{scripts_path}/algos.yaml", "r") as f: -        algos_conf = yaml.safe_load(f) +        algos_yaml = yaml.safe_load(f) if "RAPIDS_DATASET_ROOT_DIR" in os.environ: default_dataset_path = os.getenv("RAPIDS_DATASET_ROOT_DIR") @@ -162,6 +169,11 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter ) +    parser.add_argument( +        "--subset-size", +        type=positive_int, +        help="the number of subset rows of the dataset to build the index", +    ) parser.add_argument( "-k", "--count", @@ -176,13 +188,20 @@ type=positive_int, help="number of query vectors to use in each query trial", ) +    parser.add_argument( +        "--dataset-configuration", +        help="path to YAML configuration file for datasets", +    ) parser.add_argument( "--configuration", -        help="path to configuration file for a dataset", +        help="path to YAML configuration file or directory for algorithms. \ +        Any run groups found in the specified file/directory will \ +        automatically override groups of the same name present in the \ +        default configurations, including `base`", ) parser.add_argument( "--dataset", -        help="dataset whose configuration file will be used", +        help="name of dataset", default="glove-100-inner", ) parser.add_argument( @@ -197,14 +216,19 @@ parser.add_argument( "--algorithms", help="run only comma separated list of named \ -            algorithms", +            algorithms. If parameters `groups` and `algo-groups` \ +            are both undefined, then group `base` is run by default", default=None, ) parser.add_argument( -        "--indices", -        help="run only comma separated list of named indices. \ -            parameter `algorithms` is ignored", -        default=None, +        "--groups", +        help="run only comma separated groups of parameters", +        default="base", +    ) +    parser.add_argument( +        "--algo-groups", +        help='add comma separated <algorithm>.<group> to run. 
\ +        Example usage: "--algo-groups=raft_cagra.large,hnswlib.large"', ) parser.add_argument( "-f", "--force", help="re-run algorithms even if their results already exist", action="store_true", ) @@ -219,7 +243,7 @@ "--search-mode", help="run search in 'latency' (measure individual batches) or " "'throughput' (pipeline batches and measure end-to-end) mode", -        default="latency", +        default="throughput", ) args = parser.parse_args() @@ -237,92 +261,177 @@ k = args.count batch_size = args.batch_size - # Read configuration file associated to dataset - if args.configuration: - conf_filepath = args.configuration - elif args.dataset: - conf_filepath = os.path.join( - scripts_path, "conf", f"{args.dataset}.json" - ) + # Read configuration file associated to datasets + if args.dataset_configuration: + dataset_conf_f = args.dataset_configuration else: - raise ValueError( - "One of parameters `configuration` or \ - `dataset` need to be provided" - ) - conf_filename = conf_filepath.split("/")[-1] - conf_filedir = "/".join(conf_filepath.split("/")[:-1]) - dataset_path = args.dataset_path - if not os.path.exists(conf_filepath): - raise FileNotFoundError(conf_filename) - - with open(conf_filepath, "r") as f: - conf_file = json.load(f) - - dataset_name = conf_file["dataset"]["name"] + dataset_conf_f = os.path.join(scripts_path, "conf", "datasets.yaml") + with open(dataset_conf_f, "r") as f: + dataset_conf_all = yaml.safe_load(f) + + dataset_conf = None + for dataset in dataset_conf_all: + if args.dataset == dataset["name"]: + dataset_conf = dataset + break + if not dataset_conf: + raise ValueError("Could not find a dataset configuration") + + conf_file = dict() + conf_file["dataset"] = dataset_conf + if args.subset_size: + conf_file["dataset"]["subset_size"] = args.subset_size + + conf_file["search_basic_param"] = {} + conf_file["search_basic_param"]["k"] = k + conf_file["search_basic_param"]["batch_size"] = batch_size + + algos_conf_fs = os.listdir(os.path.join(scripts_path, "conf", "algos")) + algos_conf_fs = [ + os.path.join(scripts_path, "conf", "algos", f) + for f in algos_conf_fs + if ".json" not in f + ] + conf_filedir = os.path.join(scripts_path, "conf", "algos") + if args.configuration: + if os.path.isdir(args.configuration): + conf_filedir = args.configuration + algos_conf_fs = algos_conf_fs + [ + os.path.join(args.configuration, f) + for f in os.listdir(args.configuration) + if ".json" not in f + ] + elif os.path.isfile(args.configuration): + conf_filedir = os.path.normpath(args.configuration).split(os.sep) + conf_filedir = os.path.join(*conf_filedir[:-1]) + algos_conf_fs = algos_conf_fs + [args.configuration] + + filter_algos = True if args.algorithms else False + if filter_algos: + allowed_algos = args.algorithms.split(",") + named_groups = args.groups.split(",") + filter_algo_groups = True if args.algo_groups else False + allowed_algo_groups = None + if filter_algo_groups: + allowed_algo_groups = [ + algo_group.split(".") for algo_group in args.algo_groups.split(",") + ] + allowed_algo_groups = list(zip(*allowed_algo_groups)) + algos_conf = dict() + for algo_f in algos_conf_fs: + with open(algo_f, "r") as f: + algo = yaml.safe_load(f) + insert_algo = True + insert_algo_group = False + if filter_algos: + if algo["name"] not in allowed_algos: + insert_algo = False + if filter_algo_groups: + if algo["name"] in allowed_algo_groups[0]: + insert_algo_group = True + + def add_algo_group(group_list): + if algo["name"] not in algos_conf: + algos_conf[algo["name"]] = {"groups": {}} + for group in algo["groups"].keys(): + if group in group_list: + 
algos_conf[algo["name"]]["groups"][group] = algo[ + "groups" + ][group] + if "validators" in algo: + algos_conf[algo["name"]]["validators"] = algo["validators"] + + if insert_algo: + add_algo_group(named_groups) + if insert_algo_group: + add_algo_group(allowed_algo_groups[1]) executables_to_run = dict() - # At least one named index should exist in config file - if args.indices: - indices = set(args.indices.split(",")) - # algo associated with index should still be present in algos.yaml - # and enabled - for index in conf_file["index"]: - curr_algo = index["algo"] - if index["name"] in indices and validate_algorithm( - algos_conf, curr_algo, gpu_present - ): - executable_path = find_executable( - algos_conf, curr_algo, k, batch_size - ) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) - - # switch to named algorithms if indices parameter is not supplied - elif args.algorithms: - algorithms = set(args.algorithms.split(",")) - # pick out algorithms from conf file that exist - # and are enabled in algos.yaml - for index in conf_file["index"]: - curr_algo = index["algo"] - if curr_algo in algorithms and validate_algorithm( - algos_conf, curr_algo, gpu_present - ): - executable_path = find_executable( - algos_conf, curr_algo, k, batch_size - ) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) - - # default, try to run all available algorithms - else: - for index in conf_file["index"]: - curr_algo = index["algo"] - if validate_algorithm(algos_conf, curr_algo, gpu_present): - executable_path = find_executable( - algos_conf, curr_algo, k, batch_size - ) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) - - # Replace index to dataset path - for executable_path in executables_to_run: - for pos, index in enumerate( - executables_to_run[executable_path]["index"] - ): - index["file"] = os.path.join( - dataset_path, dataset_name, "index", index["name"] + for algo in algos_conf.keys(): + validate_algorithm(algos_yaml, algo, gpu_present) + for group in algos_conf[algo]["groups"].keys(): + executable = find_executable( + algos_yaml, algo, group, k, batch_size ) - executables_to_run[executable_path]["index"][pos] = index + if executable not in executables_to_run: + executables_to_run[executable] = {"index": []} + build_params = algos_conf[algo]["groups"][group]["build"] + search_params = algos_conf[algo]["groups"][group]["search"] + + param_names = [] + param_lists = [] + for param in build_params.keys(): + param_names.append(param) + param_lists.append(build_params[param]) + + all_build_params = itertools.product(*param_lists) + + search_param_names = [] + search_param_lists = [] + for search_param in search_params.keys(): + search_param_names.append(search_param) + search_param_lists.append(search_params[search_param]) + + for params in all_build_params: + index = {"algo": algo, "build_param": {}} + if group != "base": + index_name = f"{algo}_{group}" + else: + index_name = f"{algo}" + for i in range(len(params)): + index["build_param"][param_names[i]] = params[i] + index_name += "." 
+ f"{param_names[i]}{params[i]}" + + if "validators" in algos_conf[algo]: + if "build" in algos_conf[algo]["validators"]: + importable = algos_conf[algo]["validators"]["build"] + importable = importable.split(".") + module = ".".join(importable[:-1]) + func = importable[-1] + validator = import_module(module) + build_validator = getattr(validator, func) + if not build_validator( + index["build_param"], conf_file["dataset"]["dims"] + ): + continue + + index["name"] = index_name + index["file"] = os.path.join( + args.dataset_path, args.dataset, "index", index_name + ) + index["search_params"] = [] + all_search_params = itertools.product(*search_param_lists) + for search_params in all_search_params: + search_dict = dict() + for i in range(len(search_params)): + search_dict[search_param_names[i]] = search_params[i] + include = True + if "validators" in algos_conf[algo]: + if "search" in algos_conf[algo]["validators"]: + importable = algos_conf[algo]["validators"][ + "search" + ] + importable = importable.split(".") + module = ".".join(importable[:-1]) + func = importable[-1] + validator = import_module(module) + search_validator = getattr(validator, func) + include = search_validator( + search_dict, + index["build_param"], + k, + batch_size, + ) + if include: + index["search_params"].append(search_dict) + executables_to_run[executable]["index"].append(index) run_build_and_search( conf_file, - conf_filename, + f"{args.dataset}.json", conf_filedir, executables_to_run, - dataset_path, + args.dataset_path, args.force, build, search, diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml new file mode 100644 index 0000000000..6542bbab4c --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml @@ -0,0 +1,10 @@ +name: faiss_gpu_ivf_flat +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192, 16000, 32000] + ratio: [1, 10, 25] + useFloat16: [True, False] + search: + numProbes: [1, 5, 10, 50, 100, 200, 500, 1000, 2000] + refine_ratio: [1, 2, 4, 10] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml new file mode 100644 index 0000000000..7e453d506e --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml @@ -0,0 +1,12 @@ +name: faiss_gpu_ivf_pq +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192, 16000, 32000, 64000, 100000] + M_ratio: [2, 4] + ratio: [1, 10, 25] + usePrecomputed: [True, False] + useFloat16: [True, False] + search: + numProbes: [1, 5, 10, 50, 100, 200, 500, 1000, 2000] + refine_ratio: [1, 2, 4, 10] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml new file mode 100644 index 0000000000..3e90494968 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml @@ -0,0 +1,16 @@ +name: raft_cagra +validators: + search: raft-ann-bench.validators.raft_cagra_search_validator +groups: + base: + build: + graph_degree: [32, 64] + intermediate_graph_degree: [64, 96] + search: + itopk: [32, 64, 128] + + large: + build: + graph_degree: [32, 64] + search: + itopk: [32, 64, 128] diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_flat.yaml 
b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_flat.yaml new file mode 100644 index 0000000000..c36a26514d --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_flat.yaml @@ -0,0 +1,9 @@ +name: raft_ivf_flat +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192, 16384, 32000, 64000] + ratio: [1, 2, 4] + niter: [20, 25] + search: + nprobe: [1, 5, 10, 50, 100, 200, 500, 1000, 2000] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_pq.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_pq.yaml new file mode 100644 index 0000000000..2e1912c6b0 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_pq.yaml @@ -0,0 +1,17 @@ +name: raft_ivf_pq +validators: + build: raft-ann-bench.validators.raft_ivf_pq_build_validator + search: raft-ann-bench.validators.raft_ivf_pq_search_validator +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192] + pq_dim: [64, 32] + pq_bits: [8, 6, 5, 4] + ratio: [1, 10, 25] + niter: [25] + search: + nprobe: [1, 5, 10, 50, 100, 200, 500] + internalDistanceDtype: ["float", "half"] + smemLutDtype: ["float", "fp8", "half"] + refine_ratio: [1, 2, 4] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml new file mode 100644 index 0000000000..23476cc056 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml @@ -0,0 +1,113 @@ +- name: bigann-100M + base_file: bigann-1B/base.1B.u8bin + subset_size: 100000000 + dims: 128 + query_file: bigann-1B/query.public.10K.u8bin + groundtruth_neighbors_file: bigann-100M/groundtruth.neighbors.ibin + distance: euclidean + +- name: deep-1B + base_file: deep-1B/base.1B.fbin + query_file: deep-1B/query.public.10K.fbin + dims: 96 + groundtruth_neighbors_file: deep-1B/groundtruth.neighbors.ibin + distance: inner_product + +- name: deep-image-96-inner + base_file: deep-image-96-inner/base.fbin + query_file: deep-image-96-inner/query.fbin + dims: 96 + groundtruth_neighbors_file: deep-image-96-inner/groundtruth.neighbors.ibin + distance: euclidean + +- name: fashion-mnist-784-euclidean + dims: 784 + base_file: fashion-mnist-784-euclidean/base.fbin + query_file: fashion-mnist-784-euclidean/query.fbin + groundtruth_neighbors_file: fashion-mnist-784-euclidean/groundtruth.neighbors.ibin + distance: euclidean + +- name: gist-960-euclidean + dims: 960 + base_file: gist-960-euclidean/base.fbin + query_file: gist-960-euclidean/query.fbin + distance: euclidean + +- name: glove-50-angular + dims: 50 + base_file: glove-50-angular/base.fbin + query_file: glove-50-angular/query.fbin + distance: euclidean + +- name: glove-50-inner + dims: 50 + base_file: glove-50-inner/base.fbin + query_file: glove-50-inner/query.fbin + distance: euclidean + +- name: glove-100-angular + dims: 100 + base_file: glove-100-angular/base.fbin + query_file: glove-100-angular/query.fbin + distance: euclidean + +- name: glove-100-inner + dims: 100 + base_file: glove-100-inner/base.fbin + query_file: glove-100-inner/query.fbin + distance: euclidean + +- name: lastfm-65-angular + dims: 65 + base_file: lastfm-65-angular/base.fbin + query_file: 
lastfm-65-angular/query.fbin + distance: euclidean + +- name: mnist-784-euclidean + dims: 784 + base_file: mnist-784-euclidean/base.fbin + query_file: mnist-784-euclidean/query.fbin + groundtruth_neighbors_file: mnist-784-euclidean/groundtruth.neighbors.ibin + distance: euclidean + +- name: nytimes-256-angular + dims: 256 + base_file: nytimes-256-angular/base.fbin + query_file: nytimes-256-angular/query.fbin + groundtruth_neighbors_file: nytimes-256-angular/groundtruth.neighbors.ibin + distance: euclidean + +- name: nytimes-256-inner + dims: 256 + base_file: nytimes-256-inner/base.fbin + query_file: nytimes-256-inner/query.fbin + groundtruth_neighbors_file: nytimes-256-inner/groundtruth.neighbors.ibin + distance: euclidean + +- name: sift-128-euclidean + dims: 128 + base_file: sift-128-euclidean/base.fbin + query_file: sift-128-euclidean/query.fbin + groundtruth_neighbors_file: sift-128-euclidean/groundtruth.neighbors.ibin + distance: euclidean + +- name: wiki_all_1M + dims: 784 + base_file: wiki_all_1M/base.1MM.fbin + query_file: wiki_all_1M/queries.fbin + groundtruth_neighbors_file: wiki_all_1M/groundtruth.1M.neighbors.ibin + distance: euclidean + +- name: wiki_all_10M + dims: 784 + base_file: wiki_all_10M/base.10M.fbin + query_file: wiki_all_10M/queries.fbin + groundtruth_neighbors_file: wiki_all_10M/groundtruth.10M.neighbors.ibin + distance: euclidean + +- name: wiki_all_88M + dims: 784 + base_file: wiki_all_88M/base.88M.fbin + query_file: wiki_all_88M/queries.fbin + groundtruth_neighbors_file: wiki_all_88M/groundtruth.88M.neighbors.ibin + distance: euclidean diff --git a/python/raft-ann-bench/src/raft-ann-bench/validators/__init__.py b/python/raft-ann-bench/src/raft-ann-bench/validators/__init__.py new file mode 100644 index 0000000000..03bf707e6b --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/validators/__init__.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DTYPE_SIZES = {"float": 4, "half": 2, "fp8": 1} + + +def raft_ivf_pq_build_validator(params, dims): +    if "pq_dim" in params: +        return params["pq_dim"] <= dims +    return True + + +def raft_ivf_pq_search_validator(params, build_params, k, batch_size): +    ret = True +    if "internalDistanceDtype" in params and "smemLutDtype" in params: +        ret = ( +            DTYPE_SIZES[params["smemLutDtype"]] +            <= DTYPE_SIZES[params["internalDistanceDtype"]] +        ) + +    if "nlist" in build_params and "nprobe" in params: +        ret = ret and build_params["nlist"] >= params["nprobe"] +    return ret + + +def raft_cagra_search_validator(params, build_params, k, batch_size): +    if "itopk" in params: +        return params["itopk"] >= k +    return True
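To illustrate how these validators are consumed, here is a hedged usage sketch mirroring the `itertools.product` expansion in `run/__main__.py`; the grid values below are made up, and the module is resolved by string (as `run/__main__.py` does via `import_module`) because the package directory name contains hyphens:

```python
import itertools
from importlib import import_module

# Resolve the validator module the same way run/__main__.py resolves the
# dotted paths found under the `validators` key of an algorithm YAML.
validators = import_module("raft-ann-bench.validators")

# Made-up grids for illustration only.
build_grid = {"nlist": [1024, 2048], "pq_dim": [64, 32]}
search_grid = {
    "nprobe": [10, 5000],
    "internalDistanceDtype": ["float"],
    "smemLutDtype": ["half"],
}

kept = []
for build_vals in itertools.product(*build_grid.values()):
    build_param = dict(zip(build_grid, build_vals))
    for search_vals in itertools.product(*search_grid.values()):
        search_param = dict(zip(search_grid, search_vals))
        if validators.raft_ivf_pq_search_validator(
            search_param, build_param, k=10, batch_size=10000
        ):
            kept.append((build_param, search_param))

# nprobe=5000 exceeds every nlist above and is pruned; smemLutDtype "half"
# (2 bytes) fits within internalDistanceDtype "float" (4 bytes), so the
# nprobe=10 combinations pass.
```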