From dfde3b499a403af433e1e27b9174461ec086adc9 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 25 Sep 2023 10:02:07 -0700 Subject: [PATCH] Fixes for OOM during CAGRA benchmarks (#1832) Running the CAGRA benchmarks and there could be OOM errors on GPU memory with large datasets. This is caused by holding multiple copies of the dataset in GPU memory. Fix by: * Free existing memory for the dataset/graph before allocating new memory during update_dataset/update_grph * On deserialize, if the serialized index doesn't contain the dataset - don't allocate GPU memory for it * Don't call update_dataset repeatedly in the benchmarking code with the same dataset Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1832 --- cpp/include/raft/neighbors/cagra_types.hpp | 19 +++++++++++++++---- .../detail/cagra/cagra_serialize.cuh | 19 +++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 02e3f5338e..4728178194 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -165,9 +165,10 @@ struct index : ann::index { ~index() = default; /** Construct an empty index. */ - index(raft::resources const& res) + index(raft::resources const& res, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded) : ann::index(), - metric_(raft::distance::DistanceType::L2Expanded), + metric_(metric), dataset_(make_device_matrix(res, 0, 0)), graph_(make_device_matrix(res, 0, 0)) { @@ -296,7 +297,11 @@ struct index : ann::index { raft::host_matrix_view knn_graph) { RAFT_LOG_DEBUG("Copying CAGRA knn graph from host to device"); - graph_ = make_device_matrix(res, knn_graph.extent(0), knn_graph.extent(1)); + if ((graph_.extent(0) != knn_graph.extent(0)) || (graph_.extent(1) != knn_graph.extent(1))) { + // clear existing memory before allocating to prevent OOM errors on large graphs + if (graph_.size()) { graph_ = make_device_matrix(res, 0, 0); } + graph_ = make_device_matrix(res, knn_graph.extent(0), knn_graph.extent(1)); + } raft::copy(graph_.data_handle(), knn_graph.data_handle(), knn_graph.size(), @@ -311,7 +316,13 @@ struct index : ann::index { mdspan, row_major, data_accessor> dataset) { size_t padded_dim = round_up_safe(dataset.extent(1) * sizeof(T), 16) / sizeof(T); - dataset_ = make_device_matrix(res, dataset.extent(0), padded_dim); + + if ((dataset_.extent(0) != dataset.extent(0)) || + (static_cast(dataset_.extent(1)) != padded_dim)) { + // clear existing memory before allocating to prevent OOM errors on large datasets + if (dataset_.size()) { dataset_ = make_device_matrix(res, 0, 0); } + dataset_ = make_device_matrix(res, dataset.extent(0), padded_dim); + } if (dataset_.extent(1) == dataset.extent(1)) { raft::copy(dataset_.data_handle(), dataset.data_handle(), diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh index 234911e15c..8261f637e1 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh @@ -130,15 +130,22 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index auto graph_degree = deserialize_scalar(res, is); auto metric = deserialize_scalar(res, is); - auto dataset = raft::make_host_matrix(n_rows, dim); - auto graph = raft::make_host_matrix(n_rows, graph_degree); + auto graph = raft::make_host_matrix(n_rows, graph_degree); deserialize_mdspan(res, is, graph.view()); bool has_dataset = deserialize_scalar(res, is); - if (has_dataset) { deserialize_mdspan(res, is, dataset.view()); } - - return index( - res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view())); + if (has_dataset) { + auto dataset = raft::make_host_matrix(n_rows, dim); + deserialize_mdspan(res, is, dataset.view()); + return index( + res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view())); + } else { + // create a new index with no dataset - the user must supply via update_dataset themselves + // later (this avoids allocating GPU memory in the meantime) + index idx(res, metric); + idx.update_graph(res, raft::make_const_mdspan(graph.view())); + return idx; + } } template