Cagra memory optimizations (#1790)
When trying to build a CAGRA index with 500M embeddings, we were running out of memory, even when using managed memory.

This PR contains some changes to reduce the memory usage:
 * For certain large matrices, don't make second copies on the device or host if the memory is already accessible via UVM/ATS/HMM (see the sketch after this list). For instance, we were copying the intermediate graph from host to device memory, and in certain cases (500M dataset, intermediate_graph_degree=128) the intermediate graph alone was 256 GB.
 * Don't create a separate `pruned_graph` host matrix in the `optimize` call; write the pruned graph directly into the host memory passed in by the caller
 * Free the intermediate graph before creating the index
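
To illustrate the first bullet, here is a minimal sketch of the copy-avoidance pattern (not part of this commit; `get_device_accessible` and its signature are made up for illustration): ask the driver whether the host buffer is already mapped into the device address space, and only allocate and copy when it is not.

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Return a device-usable pointer for `host_ptr`. If the host allocation is already
// reachable from the device (UVM/ATS/HMM), reuse the existing mapping and leave
// *owned null; otherwise allocate a device buffer and copy the data over.
// (Error checking omitted for brevity.)
template <typename T>
T* get_device_accessible(T* host_ptr, std::size_t n, cudaStream_t stream, T** owned)
{
  cudaPointerAttributes attr{};
  cudaPointerGetAttributes(&attr, host_ptr);
  if (attr.devicePointer != nullptr) {
    *owned = nullptr;                              // no second copy needed
    return static_cast<T*>(attr.devicePointer);
  }
  void* buf = nullptr;
  cudaMallocAsync(&buf, n * sizeof(T), stream);
  cudaMemcpyAsync(buf, host_ptr, n * sizeof(T), cudaMemcpyHostToDevice, stream);
  *owned = static_cast<T*>(buf);
  return *owned;
}
```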

Authors:
  - Ben Frederickson (https://github.com/benfred)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #1790
benfred authored Sep 10, 2023
1 parent 874189d commit 12480cf
Showing 3 changed files with 130 additions and 52 deletions.
10 changes: 7 additions & 3 deletions cpp/include/raft/neighbors/cagra.cuh
@@ -256,13 +256,17 @@ index<T, IdxT> build(raft::resources const& res,
graph_degree = intermediate_degree;
}

auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree);
std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));

build_knn_graph(res, dataset, knn_graph.view());
build_knn_graph(res, dataset, knn_graph->view());

auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);

optimize<IdxT>(res, knn_graph.view(), cagra_graph.view());
optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());

// free intermediate graph before trying to create the index
knn_graph.reset();

// Construct an index from dataset and optimized knn graph.
return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
77 changes: 28 additions & 49 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -334,18 +334,13 @@ void optimize(raft::resources const& res,
auto output_graph_ptr = new_graph.data_handle();
const IdxT graph_size = new_graph.extent(0);

auto pruned_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);

{
//
// Prune kNN graph
//
auto d_input_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);

auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
@@ -376,24 +371,13 @@ void optimize(raft::resources const& res,
const double time_prune_start = cur_time();
RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

raft::copy(d_input_graph.data_handle(),
input_graph_ptr,
graph_size * input_graph_degree,
resource::get_cuda_stream(res));
void (*kernel_prune)(const IdxT* const,
const uint32_t,
const uint32_t,
const uint32_t,
const uint32_t,
const uint32_t,
uint8_t* const,
uint32_t* const,
uint64_t* const);
// Copy input_graph_ptr over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (input_graph_degree <= MAX_DEGREE) {
kernel_prune = kern_prune<MAX_DEGREE, IdxT>;
} else {
if (input_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%u). "
"It must be equal to or smaller than %d.",
@@ -410,16 +394,17 @@ void optimize(raft::resources const& res,
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kernel_prune<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
resource::sync_stream(res);
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
@@ -428,10 +413,7 @@ void optimize(raft::resources const& res,
resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");

raft::copy(detour_count.data_handle(),
d_detour_count.data_handle(),
graph_size * input_graph_degree,
resource::get_cuda_stream(res));
host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());

raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
@@ -447,7 +429,7 @@ void optimize(raft::resources const& res,
if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
for (uint64_t k = 0; k < input_graph_degree; k++) {
if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
pruned_graph.data_handle()[pk + (output_graph_degree * i)] =
output_graph_ptr[pk + (output_graph_degree * i)] =
input_graph_ptr[k + (input_graph_degree * i)];
pk += 1;
if (pk >= output_graph_degree) break;
@@ -478,8 +460,7 @@ void optimize(raft::resources const& res,
//
const double time_make_start = cur_time();

auto d_rev_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, output_graph_degree);
device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
0xff,
graph_size * output_graph_degree * sizeof(IdxT),
@@ -497,7 +478,7 @@ void optimize(raft::resources const& res,
for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
for (uint64_t i = 0; i < graph_size; i++) {
dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)];
dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
}
resource::sync_stream(res);

@@ -520,10 +501,12 @@ void optimize(raft::resources const& res,
resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");

raft::copy(rev_graph.data_handle(),
d_rev_graph.data_handle(),
graph_size * output_graph_degree,
resource::get_cuda_stream(res));
if (d_rev_graph.allocated_memory()) {
raft::copy(rev_graph.data_handle(),
d_rev_graph.data_handle(),
graph_size * output_graph_degree,
resource::get_cuda_stream(res));
}
raft::copy(rev_graph_count.data_handle(),
d_rev_graph_count.data_handle(),
graph_size,
@@ -542,10 +525,6 @@ void optimize(raft::resources const& res,
const uint64_t num_protected_edges = output_graph_degree / 2;
RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);

memcpy(output_graph_ptr,
pruned_graph.data_handle(),
sizeof(IdxT) * graph_size * output_graph_degree);

constexpr int _omp_chunk = 1024;
#pragma omp parallel for schedule(dynamic, _omp_chunk)
for (uint64_t j = 0; j < graph_size; j++) {
@@ -578,7 +557,7 @@ void optimize(raft::resources const& res,
#pragma omp parallel for reduction(+ : num_replaced_edges)
for (uint64_t i = 0; i < graph_size; i++) {
for (uint64_t k = 0; k < output_graph_degree; k++) {
const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)];
const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
const uint64_t pos =
pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
if (pos == output_graph_degree) { num_replaced_edges += 1; }
95 changes: 95 additions & 0 deletions cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -20,6 +20,8 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <raft/core/detail/macros.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdarray.hpp>
#include <type_traits>

namespace raft::neighbors::cagra::detail {
@@ -150,4 +152,97 @@ struct gen_index_msb_1_mask {
};
} // namespace utils

/**
* Utility to sync memory from a host_matrix_view to a device_matrix_view
*
* In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the
* device, and no extra allocations need to be performed. This class checks
* if the host_matrix_view is already accessible on the device, and only creates device
* memory and copies over if necessary. In memory limited situations this is preferable
* to having both a host and device copy
* TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
* have been merged, we should remove this class and switch over to using mdbuffer for this
*/
template <typename T, typename IdxT>
class device_matrix_view_from_host {
public:
device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
: host_view_(host_view)
{
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
resource::get_cuda_stream(res));
device_ptr = device_mem_->data_handle();
}
}

device_matrix_view<T, IdxT> view()
{
return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
}

T* data_handle() { return device_ptr; }

bool allocated_memory() const { return device_mem_.has_value(); }

private:
std::optional<device_matrix<T, IdxT>> device_mem_;
host_matrix_view<T, IdxT> host_view_;
T* device_ptr;
};

/**
* Utility to sync memory from a device_matrix_view to a host_matrix_view
*
* In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the
* host, and no extra allocations need to be performed. This class checks
* if the device_matrix_view is already accessible on the host, and only creates host
* memory and copies over if necessary. In memory limited situations this is preferable
* to having both a host and device copy
* TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
* have been merged, we should remove this class and switch over to using mdbuffer for this
*/
template <typename T, typename IdxT>
class host_matrix_view_from_device {
public:
host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
: device_view_(device_view)
{
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle()));
host_ptr = reinterpret_cast<T*>(attr.hostPointer);
if (host_ptr == NULL) {
// allocate memory and copy over
host_mem_.emplace(
raft::make_host_matrix<T, IdxT>(device_view.extent(0), device_view.extent(1)));
raft::copy(host_mem_->data_handle(),
device_view.data_handle(),
device_view.extent(0) * device_view.extent(1),
resource::get_cuda_stream(res));
host_ptr = host_mem_->data_handle();
}
}

host_matrix_view<T, IdxT> view()
{
return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
}

T* data_handle() { return host_ptr; }

bool allocated_memory() const { return host_mem_.has_value(); }

private:
std::optional<host_matrix<T, IdxT>> host_mem_;
device_matrix_view<T, IdxT> device_view_;
T* host_ptr;
};
} // namespace raft::neighbors::cagra::detail
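
For reference only (not part of the diff above), a hypothetical usage sketch of the new `device_matrix_view_from_host` helper, assuming the relevant RAFT headers are included and `launch_kernel_on` stands in for any kernel that reads and writes the device pointer; `host_matrix_view_from_device` follows the same pattern in the opposite direction:

```cpp
using namespace raft::neighbors::cagra::detail;

void process(raft::resources const& res, raft::host_matrix_view<float, int64_t> h)
{
  // Wrap the host matrix; a device allocation and copy happen only if the
  // memory is not already reachable from the device (UVM/ATS/HMM).
  device_matrix_view_from_host<float, int64_t> d(res, h);
  launch_kernel_on(d.data_handle(), h.extent(0), h.extent(1));  // hypothetical kernel

  // Copy results back only when a separate device buffer was actually created;
  // otherwise the kernel already wrote through the shared host mapping.
  if (d.allocated_memory()) {
    raft::copy(h.data_handle(),
               d.data_handle(),
               h.extent(0) * h.extent(1),
               raft::resource::get_cuda_stream(res));
  }
}
```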
