Commit
add-latest-changes
tarang-jain committed Jun 28, 2024
1 parent 8c1ad31 commit b4eaacc
Showing 50 changed files with 155 additions and 16,195 deletions.
4 changes: 2 additions & 2 deletions cpp/bench/ann/CMakeLists.txt
@@ -31,8 +31,8 @@ option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchm
option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON)
option(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE "Include raft's brute force knn in benchmark" ON)
option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA in benchmark" ON)
option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" OFF)
option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF)
option(RAFT_ANN_BENCH_USE_DISKANN "Include diskann algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_SINGLE_EXE
"Make a single executable with benchmark as shared library modules" OFF
85 changes: 42 additions & 43 deletions cpp/bench/ann/src/diskann/diskann_wrapper.cuh
@@ -94,7 +94,7 @@ class DiskANNMemory : public ANN<T> {
bool use_cagra_graph_;
bool use_pq_build_ = false;
uint32_t build_pq_bytes_ = 0;
std::shared_ptr<diskann::IndexWriteParameters> diskann_index_write_params_{nullptr};
// std::shared_ptr<diskann::IndexWriteParameters> diskann_index_write_params_{nullptr};
std::shared_ptr<diskann::IndexSearchParams> diskann_index_search_params_{nullptr};
std::shared_ptr<diskann::Index<T>> diskann_index_{nullptr};
// uint32_t L_load_;
@@ -111,7 +111,7 @@ DiskANNMemory<T>::DiskANNMemory(Metric metric, int dim, const BuildParam& param)
: ANN<T>(metric, dim)
{
assert(this->dim_ > 0);
diskann_index_write_params_ = std::make_shared<diskann::IndexWriteParameters>(
auto diskann_index_write_params = std::make_shared<diskann::IndexWriteParameters>(
diskann::IndexWriteParametersBuilder(param.L_build, param.R)
.with_filter_list_size(0)
.with_alpha(param.alpha)
@@ -122,37 +122,37 @@ DiskANNMemory<T>::DiskANNMemory(Metric metric, int dim, const BuildParam& param)
build_pq_bytes_ = 0;
cagra_graph_degree_ = param.cagra_graph_degree;
cagra_intermediate_graph_degree_ = param.cagra_intermediate_graph_degree;
}

template <typename T>
void DiskANNMemory<T>::build(const T* dataset, size_t nrow)
{
max_points_ = nrow;
std::cout << "num_threads" << this->diskann_index_write_params_->num_threads << std::endl;

this->diskann_index_ = std::make_shared<diskann::Index<T>>(parse_metric_type(this->metric_),
this->dim_,
max_points_,
this->diskann_index_write_params_,
this->diskann_index_ = std::make_shared<diskann::Index<T>>(parse_metric_type(metric),
dim,
10000000,
diskann_index_write_params,
nullptr,
0,
false,
false,
false,
this->use_pq_build_,
false,
this->build_pq_bytes_,
false,
false,
this->use_cagra_graph_,
cagra_graph_degree_);
param.use_cagra_graph,
param.cagra_graph_degree);
}
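
With this hunk the diskann::Index is constructed once, here in the constructor, with a fixed capacity of 10000000 points, and the write parameters become a local rather than the now commented-out member. For context, the BuildParam consumed above is not shown in the diff; the following is a minimal sketch of the fields this wrapper references, so the exact definition in the repository may differ:

    // Hypothetical shape of DiskANNMemory<T>::BuildParam, inferred from the usages above.
    struct BuildParam {
      uint32_t R;                                // DiskANN max graph degree
      uint32_t L_build;                          // DiskANN build candidate list size
      float alpha;                               // Vamana pruning slack factor
      uint32_t num_threads;                      // build thread count
      bool use_cagra_graph;                      // seed the DiskANN graph from CAGRA
      uint32_t cagra_graph_degree;               // final CAGRA graph degree
      uint32_t cagra_intermediate_graph_degree;  // NN-descent intermediate degree
    };
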

template <typename T>
void DiskANNMemory<T>::build(const T* dataset, size_t nrow)
{
max_points_ = nrow;
// std::cout << "num_threads" << this->diskann_index_write_params_->num_threads << std::endl;

if (use_cagra_graph_) {
std::optional<raft::host_matrix<uint32_t, int64_t>> intermediate_graph(
raft::make_host_matrix<uint32_t, int64_t>(nrow, cagra_intermediate_graph_degree_));

std::vector<std::vector<uint32_t>> knn_graph(nrow, std::vector<uint32_t>(cagra_graph_degree_));
auto knn_graph_view = raft::make_host_matrix_view<uint32_t, int64_t>(
knn_graph[0].data(), nrow, cagra_graph_degree_);
std::vector<uint32_t> knn_graph(nrow * cagra_graph_degree_);
auto knn_graph_view =
raft::make_host_matrix_view<uint32_t, int64_t>(knn_graph.data(), nrow, cagra_graph_degree_);
auto dataset_view = raft::make_host_matrix_view<const T, int64_t>(
dataset, static_cast<int64_t>(nrow), (int64_t)this->dim_);
raft::resources res;
@@ -161,6 +161,10 @@ void DiskANNMemory<T>::build(const T* dataset, size_t nrow)
nn_descent_params.graph_degree = cagra_intermediate_graph_degree_;
nn_descent_params.intermediate_graph_degree = 1.5 * cagra_intermediate_graph_degree_;
nn_descent_params.max_iterations = 20;
// auto ivf_pq_params =
// raft::neighbors::ivf_pq::index_params::from_dataset(dataset_view); ivf_pq_params.n_lists =
// static_cast<uint32_t>(nrow / 2500);

raft::neighbors::cagra::build_knn_graph(
res, dataset_view, intermediate_graph->view(), nn_descent_params);
raft::neighbors::cagra::optimize(res, intermediate_graph->view(), knn_graph_view);
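
The CAGRA path first builds an intermediate graph with NN-descent and then prunes it with optimize(); with, say, cagra_graph_degree_ = 64 and cagra_intermediate_graph_degree_ = 128, NN-descent is asked for a 128-degree graph (using a 1.5x, i.e. 192-degree, working graph) that optimize() reduces to 64. The result is now kept in one flat, row-major buffer rather than a vector of per-node vectors: the neighbours of node i occupy the range [i * cagra_graph_degree_, (i + 1) * cagra_graph_degree_). A minimal sketch of that indexing (the helper name is illustrative, not part of the wrapper):

    #include <cstdint>
    #include <vector>

    // Returns a pointer to the `degree` neighbour ids of node i inside a
    // flattened, row-major graph such as knn_graph above.
    inline const uint32_t* row_neighbours(const std::vector<uint32_t>& flat_graph,
                                          int64_t i, int64_t degree)
    {
      return flat_graph.data() + i * degree;
    }
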
@@ -188,15 +192,25 @@ template <typename T>
void DiskANNMemory<T>::search(
const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
{
if (this->metric_objective_ == Objective::LATENCY)
omp_set_num_threads(diskann_index_write_params_->num_threads);
// std::cout << "num_search_threads" << diskann_index_write_params_->num_threads << std::endl;
if (this->metric_objective_ == Objective::LATENCY) {
omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for
for (int64_t i = 0; i < (int64_t)batch_size; i++) {
diskann_index_->search(queries + i * this->dim_,
static_cast<size_t>(k),
L_search_,
neighbors + i * k,
distances + i * k);
for (int64_t i = 0; i < (int64_t)batch_size; i++) {
diskann_index_->search(queries + i * this->dim_,
static_cast<size_t>(k),
L_search_,
neighbors + i * k,
distances + i * k);
}
} else {
for (int64_t i = 0; i < (int64_t)batch_size; i++) {
diskann_index_->search(queries + i * this->dim_,
static_cast<size_t>(k),
L_search_,
neighbors + i * k,
distances + i * k);
}
}
}
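
In latency mode the batch is now fanned out across all available OpenMP threads; otherwise the loop runs sequentially, presumably because a throughput-style harness already issues many batches concurrently. A caller-side usage sketch of this entry point (hypothetical code, not part of the benchmark): queries, neighbors and distances are flat row-major buffers, with query i starting at offset i * dim and its k results at offset i * k.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Hypothetical driver showing the buffer layout search() expects.
    template <typename AlgoT, typename T>
    void run_batch(AlgoT& algo, const std::vector<T>& queries, int batch_size, int dim, int k)
    {
      // queries holds batch_size * dim values, laid out row-major (query i at offset i * dim).
      assert(queries.size() == static_cast<std::size_t>(batch_size) * dim);
      std::vector<std::size_t> neighbors(static_cast<std::size_t>(batch_size) * k);
      std::vector<float> distances(static_cast<std::size_t>(batch_size) * k);
      algo.search(queries.data(), batch_size, k, neighbors.data(), distances.data());
    }
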

@@ -209,21 +223,6 @@ void DiskANNMemory<T>::save(const std::string& path_to_index) const
template <typename T>
void DiskANNMemory<T>::load(const std::string& path_to_index)
{
this->diskann_index_ = std::make_shared<diskann::Index<T>>(parse_metric_type(this->metric_),
this->dim_,
max_points_,
this->diskann_index_write_params_,
nullptr,
0,
false,
false,
false,
this->use_pq_build_,
this->build_pq_bytes_,
false,
false,
this->use_cagra_graph_,
cagra_graph_degree_);
diskann_index_->load(path_to_index.c_str(), diskann_index_write_params_->num_threads, 100);
diskann_index_->load(path_to_index.c_str(), 80, 100);
}
}; // namespace raft::bench::ann
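
With the index now constructed in the wrapper's constructor, load() only deserializes into it; the second argument passed to diskann::Index::load is a thread count (hardcoded to 80 here, where the removed line took it from the write parameters) and the third appears to be the search list size to reserve. A sketch of one alternative to the hardcoded constant, sizing it from the machine instead (an assumption, not what the commit does):

    #include <cstdint>
    #include <omp.h>

    // Sketch: derive a load-time thread count from the host rather than a constant.
    inline uint32_t default_load_threads() { return static_cast<uint32_t>(omp_get_num_procs()); }
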
83 changes: 35 additions & 48 deletions cpp/cmake/patches/diskann.diff
@@ -238,7 +238,7 @@ index d0206a7..46cdee4 100644
};

diff --git a/include/index.h b/include/index.h
index b9bf4f3..88939a9 100644
index b9bf4f3..4890f00 100644
--- a/include/index.h
+++ b/include/index.h
@@ -66,7 +66,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

DISKANN_DLLEXPORT ~Index();

@@ -100,6 +101,10 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
// Batch build from a data array, which must pad vectors to aligned_dim
DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags);
@@ -98,7 +99,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, const char *tag_filename);

+ // Batch build from a data array, which must pad vectors to aligned_dim
// Batch build from a data array, which must pad vectors to aligned_dim
- DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags);
+ DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags,
+ std::vector<std::vector<uint32_t>> &raft_cagra_graph_vec);
+
+ const std::vector<uint32_t> &raft_cagra_graph_vec = std::vector<uint32_t>());
// Based on filter params builds a filtered or unfiltered index
DISKANN_DLLEXPORT void build(const std::string &data_file, const size_t num_points_to_load,
IndexFilterParams &filter_params);
@@ -236,6 +241,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
@@ -236,6 +238,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
Index(const Index<T, TagT, LabelT> &) = delete;
Index<T, TagT, LabelT> &operator=(const Index<T, TagT, LabelT> &) = delete;

+ void add_raft_cagra_neighbours(std::vector<std::vector<uint32_t>>& raft_cagra_graph_vec);
+ void add_raft_cagra_neighbours(const std::vector<uint32_t>& raft_cagra_graph_vec);
+
// Use after _data and _nd have been populated
// Acquire exclusive _update_lock before calling
void build_with_data_populated(const std::vector<TagT> &tags);
@@ -444,5 +451,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
@@ -444,5 +448,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
std::vector<non_recursive_mutex> _locks;

static const float INDEX_GROWTH_FACTOR;
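
The members this hunk appends to the class are collapsed in this view, but the index.cpp hunks below use two of them; a sketch of what their declarations would look like, inferred from that usage (the names are taken from the patch, the types and defaults are assumptions):

    // Inferred from the uses of _raft_cagra_graph and _raft_cagra_graph_degree below.
    bool _raft_cagra_graph = false;         // adopt a prebuilt CAGRA graph instead of running Vamana
    uint32_t _raft_cagra_graph_degree = 0;  // row width of the flattened CAGRA adjacency list
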
@@ -395,7 +394,7 @@ index c12b251..ea39001 100644
+
} // namespace diskann
diff --git a/src/index.cpp b/src/index.cpp
index bf93344..014ed66 100644
index bf93344..665f45a 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -37,8 +37,10 @@ Index<T, TagT, LabelT>::Index(const IndexConfig &index_config, std::shared_ptr<A
Expand Down Expand Up @@ -430,20 +429,29 @@ index bf93344..014ed66 100644
.build(),
IndexFactory::construct_datastore<T>(DataStoreStrategy::MEMORY,
(max_points == 0 ? (size_t)1 : max_points) +
@@ -1505,6 +1510,21 @@ void Index<T, TagT, LabelT>::set_start_points_at_random(T radius, uint32_t rando
@@ -1505,6 +1510,30 @@ void Index<T, TagT, LabelT>::set_start_points_at_random(T radius, uint32_t rando
set_start_points(points_data.data(), points_data.size());
}

+template <typename T, typename TagT, typename LabelT>
+void Index<T, TagT, LabelT>::add_raft_cagra_neighbours(std::vector<std::vector<uint32_t>> &raft_cagra_graph_vec)
+void Index<T, TagT, LabelT>::add_raft_cagra_neighbours(const std::vector<uint32_t> &raft_cagra_graph_vec)
+{
+ std::cout << "inside add_raft_cagra_neighbours" << std::endl;
+ std::vector<std::vector<uint32_t>> &graph = _graph_store->graph();
+ std::cout << "accessed graph" << std::endl;
+ std::cout << "graph size " << graph.size() << std::endl;
+
+#pragma omp parallel for num_threads(_indexingThreads)
+ for (int i = 0; i < graph.size(); i++)
+ {
+ _graph_store->set_neighbours(i, raft_cagra_graph_vec[i]);
+ assert(_graph_store->get_neighbours((location_t)i).size() <= _indexingRange);
+ raft_cagra_graph_vec[i].clear();
+ graph[i].resize(_raft_cagra_graph_degree);
+ for (int j = 0; j < _raft_cagra_graph_degree; j++)
+ {
+ graph[i][j] = raft_cagra_graph_vec[i * _raft_cagra_graph_degree + j];
+ }
+ // if (i % 100000 == 0) {
+ // std::cout << "resized " << i << " rows" << std::endl;
+ // }
+ }
+ std::cout << "_indexingThreads" << _indexingThreads << std::endl;
+ _has_built = true;
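
The updated body writes the flattened CAGRA adjacency list straight into DiskANN's in-memory graph store: each node's row is resized to _raft_cagra_graph_degree and filled from the flat array. An equivalent per-row copy, sketched as a standalone helper under the same row-major assumption (not part of the patch):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Copy row i of a flattened, row-major adjacency list into a per-node
    // neighbour vector, as the j-loop above does element by element.
    inline void copy_row(const std::vector<uint32_t>& flat, std::size_t i,
                         std::size_t degree, std::vector<uint32_t>& out)
    {
      out.resize(degree);
      std::copy(flat.begin() + i * degree, flat.begin() + (i + 1) * degree, out.begin());
    }
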
template <typename T, typename TagT, typename LabelT>
void Index<T, TagT, LabelT>::build_with_data_populated(const std::vector<TagT> &tags)
{
@@ -1575,6 +1595,7 @@ void Index<T, TagT, LabelT>::_build(const DataType &data, const size_t num_point
@@ -1575,8 +1604,10 @@ void Index<T, TagT, LabelT>::_build(const DataType &data, const size_t num_point
throw ANNException("Error" + std::string(e.what()), -1);
}
}
+
template <typename T, typename TagT, typename LabelT>
void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags)
-void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags)
+void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags,
+ const std::vector<uint32_t> &raft_cagra_graph_vec)
{
@@ -1593,13 +1614,44 @@ void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_loa
if (num_points_to_load == 0)
{
@@ -1593,11 +1624,18 @@ void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_loa
{
std::unique_lock<std::shared_timed_mutex> tl(_tag_lock);
_nd = num_points_to_load;
-
_data_store->populate_data(data, (location_t)num_points_to_load);
}
-
build_with_data_populated(tags);
}

+template <typename T, typename TagT, typename LabelT>
+void Index<T, TagT, LabelT>::build(const T *data, const size_t num_points_to_load, const std::vector<TagT> &tags,
+ std::vector<std::vector<uint32_t>> &raft_cagra_graph_vec)
+{
+ if (num_points_to_load == 0)
+ {
+ throw ANNException("Do not call build with 0 points", -1, __FUNCSIG__, __FILE__, __LINE__);
+ }
+ if (_pq_dist)
+ {
+ throw ANNException("ERROR: DO not use this build interface with PQ distance", -1, __FUNCSIG__, __FILE__,
+ __LINE__);
+ }
+
+ std::unique_lock<std::shared_timed_mutex> ul(_update_lock);
+
+ {
+ std::unique_lock<std::shared_timed_mutex> tl(_tag_lock);
+ _nd = num_points_to_load;
+ _data_store->populate_data(data, (location_t)num_points_to_load);
+ }
- build_with_data_populated(tags);
+ if (!_raft_cagra_graph)
+ build_with_data_populated(tags);
+ else
+ _start = calculate_entry_point();
+ add_raft_cagra_neighbours(raft_cagra_graph_vec);
+ }
+}
+
}
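
With this change, when _raft_cagra_graph is set, build() populates the data store, computes the entry point with calculate_entry_point(), and adopts the CAGRA neighbours via add_raft_cagra_neighbours() instead of running the usual Vamana link step. A hypothetical wrapper-side call into the patched overload (a sketch only; IndexT stands in for diskann::Index<T>, and graph is the flattened nrow * degree CAGRA adjacency list built earlier in the wrapper):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    template <typename IndexT, typename T>
    void build_from_cagra(IndexT& index, const T* data, std::size_t nrow,
                          const std::vector<uint32_t>& graph)
    {
      std::vector<uint32_t> tags;  // the tag-based build path is not used here
      index.build(data, nrow, tags, graph);
    }
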
template <typename T, typename TagT, typename LabelT>
void Index<T, TagT, LabelT>::build(const char *filename, const size_t num_points_to_load, const std::vector<TagT> &tags)
{