Skip to content

Commit

Permalink
Merge branch 'branch-23.10' of github.com:rapidsai/raft into fea/devcontainers
Browse files Browse the repository at this point in the history
  • Loading branch information
trxcllnt committed Sep 25, 2023
2 parents 3d3161b + dfde3b4 commit 41bb5f5
Show file tree
Hide file tree
Showing 10 changed files with 3,951 additions and 601 deletions.
19 changes: 15 additions & 4 deletions cpp/include/raft/neighbors/cagra_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,10 @@ struct index : ann::index {
~index() = default;

/** Construct an empty index. */
index(raft::resources const& res)
index(raft::resources const& res,
raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
: ann::index(),
metric_(raft::distance::DistanceType::L2Expanded),
metric_(metric),
dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
{
Expand Down Expand Up @@ -296,7 +297,11 @@ struct index : ann::index {
raft::host_matrix_view<const IdxT, int64_t, row_major> knn_graph)
{
RAFT_LOG_DEBUG("Copying CAGRA knn graph from host to device");
graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
if ((graph_.extent(0) != knn_graph.extent(0)) || (graph_.extent(1) != knn_graph.extent(1))) {
// clear existing memory before allocating to prevent OOM errors on large graphs
if (graph_.size()) { graph_ = make_device_matrix<IdxT, int64_t>(res, 0, 0); }
graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
}
raft::copy(graph_.data_handle(),
knn_graph.data_handle(),
knn_graph.size(),
Expand All @@ -311,7 +316,13 @@ struct index : ann::index {
mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset)
{
size_t padded_dim = round_up_safe<size_t>(dataset.extent(1) * sizeof(T), 16) / sizeof(T);
dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);

if ((dataset_.extent(0) != dataset.extent(0)) ||
(static_cast<size_t>(dataset_.extent(1)) != padded_dim)) {
// clear existing memory before allocating to prevent OOM errors on large datasets
if (dataset_.size()) { dataset_ = make_device_matrix<T, int64_t>(res, 0, 0); }
dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);
}
if (dataset_.extent(1) == dataset.extent(1)) {
raft::copy(dataset_.data_handle(),
dataset.data_handle(),
Expand Down
19 changes: 13 additions & 6 deletions cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,22 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
auto graph_degree = deserialize_scalar<std::uint32_t>(res, is);
auto metric = deserialize_scalar<raft::distance::DistanceType>(res, is);

auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
deserialize_mdspan(res, is, graph.view());

bool has_dataset = deserialize_scalar<bool>(res, is);
if (has_dataset) { deserialize_mdspan(res, is, dataset.view()); }

return index<T, IdxT>(
res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
if (has_dataset) {
auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
deserialize_mdspan(res, is, dataset.view());
return index<T, IdxT>(
res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
} else {
// create a new index with no dataset - the user must supply via update_dataset themselves
// later (this avoids allocating GPU memory in the meantime)
index<T, IdxT> idx(res, metric);
idx.update_graph(res, raft::make_const_mdspan(graph.view()));
return idx;
}
}

template <typename T, typename IdxT>
Expand Down
41 changes: 14 additions & 27 deletions python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,18 +254,18 @@ def create_plot_build(
xn = "k-nn"
yn = "qps"

# recall_85 = [-1] * len(linestyles)
qps_85 = [-1] * len(linestyles)
bt_85 = [0] * len(linestyles)
i_85 = [-1] * len(linestyles)
# recall_90 = [-1] * len(linestyles)

qps_90 = [-1] * len(linestyles)
bt_90 = [0] * len(linestyles)
i_90 = [-1] * len(linestyles)
# recall_95 = [-1] * len(linestyles)

qps_95 = [-1] * len(linestyles)
bt_95 = [0] * len(linestyles)
i_95 = [-1] * len(linestyles)

data = OrderedDict()
colors = OrderedDict()

Expand Down Expand Up @@ -303,7 +303,7 @@ def mean_y(algo):
plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()
print(f"writing search output to {fn_out}")
print(f"writing build output to {fn_out}")
plt.title("Build Time for Highest QPS")
plt.suptitle(f"{dataset} k={k} batch_size={batch_size}")
plt.ylabel("Build Time (s)")
Expand All @@ -313,35 +313,22 @@ def mean_y(algo):
def load_lines(results_path, result_files, method, index_key):
results = dict()

linebreaker = "name,iterations"

for result_filename in result_files:
if result_filename.endswith(".csv"):
with open(os.path.join(results_path, result_filename), "r") as f:
lines = f.readlines()
lines = lines[:-1] if lines[-1] == "\n" else lines
idx = 0
for pos, line in enumerate(lines):
if linebreaker in line:
idx = pos
break

if method == "build":
if "hnswlib" in result_filename:
key_idx = [2]
else:
key_idx = [10]
key_idx = [2]
elif method == "search":
if "hnswlib" in result_filename:
key_idx = [10, 6]
else:
key_idx = [12, 10]
key_idx = [2, 3]

for line in lines[idx + 1 :]:
for line in lines[1:]:
split_lines = line.split(",")

algo_name = split_lines[0].split(".")[0].strip('"')
index_name = split_lines[0].split("/")[0].strip('"')
algo_name = split_lines[0]
index_name = split_lines[1]

if index_key == "algo":
dict_key = algo_name
Expand Down Expand Up @@ -394,9 +381,7 @@ def main():
)
parser.add_argument(
"--dataset-path",
help="path to dataset folder, by default will look in "
"RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets "
"subdirectory from the calling directory",
help="path to dataset folder",
default=default_dataset_path,
)
parser.add_argument(
Expand Down Expand Up @@ -460,10 +445,12 @@ def main():
search = args.search

search_output_filepath = os.path.join(
args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png"
args.output_filepath,
f"search-{args.dataset}-k{k}-batch_size{batch_size}.png",
)
build_output_filepath = os.path.join(
args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png"
args.output_filepath,
f"build-{args.dataset}-k{k}-batch_size{batch_size}.png",
)

search_results = load_all_results(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"name": "deep-image-96-inner",
"base_file": "deep-image-96-inner/base.fbin",
"query_file": "deep-image-96-inner/query.fbin",
"groundtruth_neighbors_file": "deep-image-96-inner/groundtruth.neighbors.ibin",
"distance": "euclidean"
},
"search_basic_param": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"name": "fashion-mnist-784-euclidean",
"base_file": "fashion-mnist-784-euclidean/base.fbin",
"query_file": "fashion-mnist-784-euclidean/query.fbin",
"groundtruth_neighbors_file": "fashion-mnist-784-euclidean/groundtruth.neighbors.ibin",
"distance": "euclidean"
},
"search_basic_param": {
Expand Down
Loading

0 comments on commit 41bb5f5

Please sign in to comment.