Skip to content

Commit

Permalink
Merge branch 'branch-23.10' into fea-cublaslt-matmul
Browse files Browse the repository at this point in the history
  • Loading branch information
achirkin authored Sep 13, 2023
2 parents 01e62b0 + 28b7894 commit 8fdf6cc
Show file tree
Hide file tree
Showing 10 changed files with 244 additions and 249 deletions.
2 changes: 2 additions & 0 deletions conda/recipes/raft-ann-bench/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ requirements:
- h5py {{ h5py_version }}
- benchmark
- matplotlib
# rmm is needed to determine if package is gpu-enabled
- rmm ={{ minor_version }}
- python
- pandas
- pyyaml
Expand Down
5 changes: 3 additions & 2 deletions cpp/bench/ann/src/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,10 @@ void bench_search(::benchmark::State& state,
try {
algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
dataset->base_set_size());
} catch (const std::exception&) {
} catch (const std::exception& ex) {
state.SkipWithError("The algorithm '" + index.name +
"' requires the base set, but it's not available.");
"' requires the base set, but it's not available. " +
"Exception: " + std::string(ex.what()));
return;
}
}
Expand Down
3 changes: 1 addition & 2 deletions cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ template <typename T>
void parse_build_param(const nlohmann::json& conf,
typename raft::bench::ann::Ggnn<T>::BuildParam& param)
{
param.dataset_size = conf.at("dataset_size");
param.k = conf.at("k");
param.k = conf.at("k");

if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); }
if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); }
Expand Down
22 changes: 4 additions & 18 deletions cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ class Ggnn : public ANN<T> {
int num_layers{4}; // L
float tau{0.5};
int refine_iterations{2};

size_t dataset_size;
int k; // GGNN requires to know k during building
};

Expand Down Expand Up @@ -182,24 +180,17 @@ GgnnImpl<T, measure, D, KBuild, KQuery, S>::GgnnImpl(Metric metric,
}

if (dim != D) { throw std::runtime_error("mis-matched dim"); }

int device;
RAFT_CUDA_TRY(cudaGetDevice(&device));

ggnn_ = std::make_unique<GGNNGPUInstance>(
device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau);
}

template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
size_t nrow,
cudaStream_t stream)
{
if (nrow != build_param_.dataset_size) {
throw std::runtime_error(
"build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
" , but nrow = " + std::to_string(nrow));
}
int device;
RAFT_CUDA_TRY(cudaGetDevice(&device));
ggnn_ = std::make_unique<GGNNGPUInstance>(
device, nrow, build_param_.num_layers, true, build_param_.tau);

ggnn_->set_base_data(dataset);
ggnn_->set_stream(stream);
Expand All @@ -212,11 +203,6 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
void GgnnImpl<T, measure, D, KBuild, KQuery, S>::set_search_dataset(const T* dataset, size_t nrow)
{
if (nrow != build_param_.dataset_size) {
throw std::runtime_error(
"build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
" , but nrow = " + std::to_string(nrow));
}
ggnn_->set_base_data(dataset);
}

Expand Down
6 changes: 4 additions & 2 deletions cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include <utility>
#include <vector>

#include <omp.h>

#include "../common/ann_types.hpp"
#include <hnswlib.h>

Expand Down Expand Up @@ -164,13 +166,13 @@ class HnswLib : public ANN<T> {
struct BuildParam {
int M;
int ef_construction;
int num_threads{1};
int num_threads = omp_get_num_procs();
};

using typename ANN<T>::AnnSearchParam;
struct SearchParam : public AnnSearchParam {
int ef;
int num_threads{1};
int num_threads = omp_get_num_procs();
};

HnswLib(Metric metric, int dim, const BuildParam& param);
Expand Down
10 changes: 7 additions & 3 deletions cpp/include/raft/neighbors/cagra.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -256,13 +256,17 @@ index<T, IdxT> build(raft::resources const& res,
graph_degree = intermediate_degree;
}

auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree);
std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));

build_knn_graph(res, dataset, knn_graph.view());
build_knn_graph(res, dataset, knn_graph->view());

auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);

optimize<IdxT>(res, knn_graph.view(), cagra_graph.view());
optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());

// free intermediate graph before trying to create the index
knn_graph.reset();

// Construct an index from dataset and optimized knn graph.
return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
Expand Down
77 changes: 28 additions & 49 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -334,18 +334,13 @@ void optimize(raft::resources const& res,
auto output_graph_ptr = new_graph.data_handle();
const IdxT graph_size = new_graph.extent(0);

auto pruned_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);

{
//
// Prune kNN graph
//
auto d_input_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);

auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
Expand Down Expand Up @@ -376,24 +371,13 @@ void optimize(raft::resources const& res,
const double time_prune_start = cur_time();
RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

raft::copy(d_input_graph.data_handle(),
input_graph_ptr,
graph_size * input_graph_degree,
resource::get_cuda_stream(res));
void (*kernel_prune)(const IdxT* const,
const uint32_t,
const uint32_t,
const uint32_t,
const uint32_t,
const uint32_t,
uint8_t* const,
uint32_t* const,
uint64_t* const);
// Copy input_graph_ptr over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (input_graph_degree <= MAX_DEGREE) {
kernel_prune = kern_prune<MAX_DEGREE, IdxT>;
} else {
if (input_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%u). "
"It must be equal to or smaller than %d.",
Expand All @@ -410,16 +394,17 @@ void optimize(raft::resources const& res,
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kernel_prune<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
resource::sync_stream(res);
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
Expand All @@ -428,10 +413,7 @@ void optimize(raft::resources const& res,
resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");

raft::copy(detour_count.data_handle(),
d_detour_count.data_handle(),
graph_size * input_graph_degree,
resource::get_cuda_stream(res));
host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());

raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
Expand All @@ -447,7 +429,7 @@ void optimize(raft::resources const& res,
if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
for (uint64_t k = 0; k < input_graph_degree; k++) {
if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
pruned_graph.data_handle()[pk + (output_graph_degree * i)] =
output_graph_ptr[pk + (output_graph_degree * i)] =
input_graph_ptr[k + (input_graph_degree * i)];
pk += 1;
if (pk >= output_graph_degree) break;
Expand Down Expand Up @@ -478,8 +460,7 @@ void optimize(raft::resources const& res,
//
const double time_make_start = cur_time();

auto d_rev_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, output_graph_degree);
device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
0xff,
graph_size * output_graph_degree * sizeof(IdxT),
Expand All @@ -497,7 +478,7 @@ void optimize(raft::resources const& res,
for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
for (uint64_t i = 0; i < graph_size; i++) {
dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)];
dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
}
resource::sync_stream(res);

Expand All @@ -520,10 +501,12 @@ void optimize(raft::resources const& res,
resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");

raft::copy(rev_graph.data_handle(),
d_rev_graph.data_handle(),
graph_size * output_graph_degree,
resource::get_cuda_stream(res));
if (d_rev_graph.allocated_memory()) {
raft::copy(rev_graph.data_handle(),
d_rev_graph.data_handle(),
graph_size * output_graph_degree,
resource::get_cuda_stream(res));
}
raft::copy(rev_graph_count.data_handle(),
d_rev_graph_count.data_handle(),
graph_size,
Expand All @@ -542,10 +525,6 @@ void optimize(raft::resources const& res,
const uint64_t num_protected_edges = output_graph_degree / 2;
RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);

memcpy(output_graph_ptr,
pruned_graph.data_handle(),
sizeof(IdxT) * graph_size * output_graph_degree);

constexpr int _omp_chunk = 1024;
#pragma omp parallel for schedule(dynamic, _omp_chunk)
for (uint64_t j = 0; j < graph_size; j++) {
Expand Down Expand Up @@ -578,7 +557,7 @@ void optimize(raft::resources const& res,
#pragma omp parallel for reduction(+ : num_replaced_edges)
for (uint64_t i = 0; i < graph_size; i++) {
for (uint64_t k = 0; k < output_graph_degree; k++) {
const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)];
const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
const uint64_t pos =
pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
if (pos == output_graph_degree) { num_replaced_edges += 1; }
Expand Down
95 changes: 95 additions & 0 deletions cpp/include/raft/neighbors/detail/cagra/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <raft/core/detail/macros.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdarray.hpp>
#include <type_traits>

namespace raft::neighbors::cagra::detail {
Expand Down Expand Up @@ -150,4 +152,97 @@ struct gen_index_msb_1_mask {
};
} // namespace utils

/**
* Utility to sync memory from a host_matrix_view to a device_matrix_view
*
* In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the
* device, and no extra allocations need to be performed. This class checks
* if the host_matrix_view is already accessible on the device, and only creates device
* memory and copies over if necessary. In memory limited situations this is preferable
* to having both a host and device copy
* TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
* have been merged, we should remove this class and switch over to using mdbuffer for this
*/
template <typename T, typename IdxT>
class device_matrix_view_from_host {
public:
device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
: host_view_(host_view)
{
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
resource::get_cuda_stream(res));
device_ptr = device_mem_->data_handle();
}
}

device_matrix_view<T, IdxT> view()
{
return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
}

T* data_handle() { return device_ptr; }

bool allocated_memory() const { return device_mem_.has_value(); }

private:
std::optional<device_matrix<T, IdxT>> device_mem_;
host_matrix_view<T, IdxT> host_view_;
T* device_ptr;
};

/**
* Utility to sync memory from a device_matrix_view to a host_matrix_view
*
* In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the
* host, and no extra allocations need to be performed. This class checks
* if the device_matrix_view is already accessible on the host, and only creates host
* memory and copies over if necessary. In memory limited situations this is preferable
* to having both a host and device copy
* TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
* have been merged, we should remove this class and switch over to using mdbuffer for this
*/
template <typename T, typename IdxT>
class host_matrix_view_from_device {
public:
host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
: device_view_(device_view)
{
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle()));
host_ptr = reinterpret_cast<T*>(attr.hostPointer);
if (host_ptr == NULL) {
// allocate memory and copy over
host_mem_.emplace(
raft::make_host_matrix<T, IdxT>(device_view.extent(0), device_view.extent(1)));
raft::copy(host_mem_->data_handle(),
device_view.data_handle(),
device_view.extent(0) * device_view.extent(1),
resource::get_cuda_stream(res));
host_ptr = host_mem_->data_handle();
}
}

host_matrix_view<T, IdxT> view()
{
return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
}

T* data_handle() { return host_ptr; }

bool allocated_memory() const { return host_mem_.has_value(); }

private:
std::optional<host_matrix<T, IdxT>> host_mem_;
device_matrix_view<T, IdxT> device_view_;
T* host_ptr;
};
} // namespace raft::neighbors::cagra::detail
2 changes: 1 addition & 1 deletion docs/source/ann_benchmarks_param_tuning.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,4 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
| `ef` | `search_param` | Y | Positive Integer >0 | | Size of the dynamic list for the nearest neighbors used for search. Higher value leads to more accurate but slower search. Cannot be lower than `k`. |
| `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. |

Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments.
Please refer to [HNSW algorithm parameters guide] from `hnswlib` to learn more about these arguments.
Loading

0 comments on commit 8fdf6cc

Please sign in to comment.