Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into 24.12-nb-milvus
Browse files Browse the repository at this point in the history
  • Loading branch information
lowener authored Nov 20, 2024
2 parents 0a61fc0 + f127b06 commit 165a1cd
Show file tree
Hide file tree
Showing 29 changed files with 348 additions and 98 deletions.
5 changes: 4 additions & 1 deletion ci/build_wheel_cuvs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

set -euo pipefail

package_dir="python/cuvs"

case "${RAPIDS_CUDA_VERSION}" in
12.*)
EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
Expand All @@ -15,4 +17,5 @@ esac
# Set up skbuild options. Enable sccache in skbuild config options
export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}"

ci/build_wheel.sh cuvs python/cuvs
ci/build_wheel.sh cuvs ${package_dir}
ci/validate_wheel.sh ${package_dir} final_dist
35 changes: 35 additions & 0 deletions ci/validate_wheel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

package_dir=$1
wheel_dir_relative_path=$2

RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"

# some packages are much larger on CUDA 11 than on CUDA 12
if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
PYDISTCHECK_ARGS=(
--max-allowed-size-compressed '1.4G'
)
else
PYDISTCHECK_ARGS=(
--max-allowed-size-compressed '950M'
)
fi

cd "${package_dir}"

rapids-logger "validate packages with 'pydistcheck'"

pydistcheck \
--inspect \
"${PYDISTCHECK_ARGS[@]}" \
"$(echo ${wheel_dir_relative_path}/*.whl)"

rapids-logger "validate packages with 'twine'"

twine check \
--strict \
"$(echo ${wheel_dir_relative_path}/*.whl)"
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/nn_descent.cu
src/neighbors/nn_descent_float.cu
src/neighbors/nn_descent_half.cu
src/neighbors/nn_descent_index.cpp
src/neighbors/nn_descent_int8.cu
src/neighbors/nn_descent_uint8.cu
src/neighbors/reachability.cu
Expand Down
24 changes: 14 additions & 10 deletions cpp/include/cuvs/neighbors/nn_descent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,10 @@ struct index_params : cuvs::neighbors::index_params {
/** @brief Construct NN descent parameters for a specific kNN graph degree
*
* @param graph_degree output graph degree
* @param metric distance metric to use
*/
index_params(size_t graph_degree = 64)
: graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree)
{
}
index_params(size_t graph_degree = 64,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded);
};

/**
Expand Down Expand Up @@ -103,11 +102,16 @@ struct index : cuvs::neighbors::index {
* @param n_rows number of rows in knn-graph
* @param n_cols number of cols in knn-graph
* @param return_distances whether to return distances
* @param metric distance metric to use
*/
index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false)
index(raft::resources const& res,
int64_t n_rows,
int64_t n_cols,
bool return_distances = false,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
: cuvs::neighbors::index(),
res_{res},
metric_{cuvs::distance::DistanceType::L2Expanded},
metric_{metric},
graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(n_rows, n_cols)},
graph_view_{graph_.view()},
return_distances_{return_distances}
Expand All @@ -129,14 +133,16 @@ struct index : cuvs::neighbors::index {
* @param graph_view raft::host_matrix_view<IdxT, int64_t, raft::row_major> for storing knn-graph
* @param distances_view optional raft::device_matrix_view<float, int64_t, row_major> for storing
* distances
* @param metric distance metric to use
*/
index(raft::resources const& res,
raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view,
std::optional<raft::device_matrix_view<float, int64_t, row_major>> distances_view =
std::nullopt)
std::nullopt,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
: cuvs::neighbors::index(),
res_{res},
metric_{cuvs::distance::DistanceType::L2Expanded},
metric_{metric},
graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0)},
graph_view_{graph_view},
distances_view_{distances_view},
Expand Down Expand Up @@ -473,8 +479,6 @@ auto build(raft::resources const& res,
std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;

/** @} */

/**
* @brief Test if we have enough GPU memory to run NN descent algorithm.
*
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/neighbors/cagra_c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <cuvs/neighbors/cagra.h>
#include <cuvs/neighbors/cagra.hpp>

#include <fstream>

namespace {

template <typename T>
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -436,11 +436,11 @@ index<T, IdxT> build(
auto knn_build_params = params.graph_build_params;
if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
// Heuristic to decide default build algo and its params.
if (params.metric == cuvs::distance::DistanceType::L2Expanded &&
cuvs::neighbors::nn_descent::has_enough_device_memory(
if (cuvs::neighbors::nn_descent::has_enough_device_memory(
res, dataset.extents(), sizeof(IdxT))) {
RAFT_LOG_DEBUG("NN descent solver");
knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree);
knn_build_params =
cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric);
} else {
RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
Expand All @@ -453,9 +453,6 @@ index<T, IdxT> build(
std::get<cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(knn_build_params);
build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params);
} else {
RAFT_EXPECTS(
params.metric == cuvs::distance::DistanceType::L2Expanded,
"L2Expanded is the only distance metrics supported for CAGRA build with nn_descent");
auto nn_descent_params =
std::get<cagra::graph_build_params::nn_descent_params>(knn_build_params);

Expand All @@ -466,7 +463,8 @@ index<T, IdxT> build(
"nn-descent graph_degree.",
nn_descent_params.graph_degree,
intermediate_degree);
nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree);
nn_descent_params =
cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric);
}

// Use nn-descent to build CAGRA knn graph
Expand Down
1 change: 1 addition & 0 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g
// count number of detours (A->D->B)
for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) {
const uint64_t iD = knn_graph[kAD + (graph_degree * iA)];
if (iD >= graph_size) { continue; }
for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) {
const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)];
for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) {
Expand Down
87 changes: 59 additions & 28 deletions cpp/src/neighbors/detail/nn_descent.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "ann_utils.cuh"
#include "cagra/device_common.hpp"

#include <cuvs/distance/distance.hpp>
#include <cuvs/neighbors/nn_descent.hpp>

#include <raft/core/device_mdarray.hpp>
Expand Down Expand Up @@ -216,6 +217,7 @@ struct BuildConfig {
size_t max_iterations{50};
float termination_threshold{0.0001};
size_t output_graph_degree{32};
cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded};
};

template <typename Index_t>
Expand Down Expand Up @@ -454,11 +456,13 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer,
// TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827
/** Calculate L2 norm, and cast data to __half */
template <typename Data_t>
RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data,
__half* output_data,
int dim,
DistData_t* l2_norms,
size_t list_offset = 0)
RAFT_KERNEL preprocess_data_kernel(
const Data_t* input_data,
__half* output_data,
int dim,
DistData_t* l2_norms,
size_t list_offset = 0,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
{
extern __shared__ char buffer[];
__shared__ float l2_norm;
Expand All @@ -468,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data,
load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size());
if (threadIdx.x == 0) { l2_norm = 0; }
__syncthreads();
int lane_id = threadIdx.x % raft::warp_size();
for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
int idx = step * raft::warp_size() + lane_id;
float part_dist = 0;
if (idx < dim) {
part_dist = s_vec[idx];
part_dist = part_dist * part_dist;
}
__syncwarp();
for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);

if (metric == cuvs::distance::DistanceType::L2Expanded ||
metric == cuvs::distance::DistanceType::CosineExpanded) {
int lane_id = threadIdx.x % raft::warp_size();
for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
int idx = step * raft::warp_size() + lane_id;
float part_dist = 0;
if (idx < dim) {
part_dist = s_vec[idx];
part_dist = part_dist * part_dist;
}
__syncwarp();
for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);
}
if (lane_id == 0) { l2_norm += part_dist; }
__syncwarp();
}
if (lane_id == 0) { l2_norm += part_dist; }
__syncwarp();
}

for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
int idx = step * raft::warp_size() + threadIdx.x;
if (idx < dim) {
if (l2_norms == nullptr) {
if (metric == cuvs::distance::DistanceType::InnerProduct) {
output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx];
} else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
output_data[list_id * dim + idx] =
(float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm);
} else {
Expand Down Expand Up @@ -715,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4)
DistData_t* dists,
int graph_width,
int* locks,
DistData_t* l2_norms)
DistData_t* l2_norms,
cuvs::distance::DistanceType metric)
{
#if (__CUDA_ARCH__ >= 700)
using namespace nvcuda;
Expand Down Expand Up @@ -827,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4)
for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size &&
i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
if (l2_norms == nullptr) {
if (metric == cuvs::distance::DistanceType::InnerProduct) {
s_distances[i] = -s_distances[i];
} else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
s_distances[i] = 1.0 - s_distances[i];
} else {
s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
Expand Down Expand Up @@ -906,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4)
for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size &&
i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
if (l2_norms == nullptr) {
if (metric == cuvs::distance::DistanceType::InnerProduct) {
s_distances[i] = -s_distances[i];
} else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
s_distances[i] = 1.0 - s_distances[i];
} else {
s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
Expand Down Expand Up @@ -1161,7 +1176,7 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
ndim_(build_config.dataset_dim),
d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>(
res, nrow_, build_config.dataset_dim)},
l2_norms_{raft::make_device_vector<DistData_t, size_t>(res, nrow_)},
l2_norms_{raft::make_device_vector<DistData_t, size_t>(res, 0)},
graph_buffer_{
raft::make_device_matrix<ID_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
dists_buffer_{
Expand All @@ -1181,11 +1196,16 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
d_list_sizes_old_{raft::make_device_vector<int2, size_t>(res, nrow_)}
{
static_assert(NUM_SAMPLES <= 32);

raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
auto graph_buffer_view = raft::make_device_matrix_view<Index_t, int64_t>(
reinterpret_cast<Index_t*>(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE);
raft::matrix::fill(res, graph_buffer_view, std::numeric_limits<Index_t>::max());
raft::matrix::fill(res, d_locks_.view(), 0);

if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) {
l2_norms_ = raft::make_device_vector<DistData_t, size_t>(res, nrow_);
}
};

template <typename Data_t, typename Index_t>
Expand Down Expand Up @@ -1228,7 +1248,8 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
dists_buffer_.data_handle(),
DEGREE_ON_DEVICE,
d_locks_.data_handle(),
l2_norms_.data_handle());
l2_norms_.data_handle(),
build_config_.metric);
}

template <typename Data_t, typename Index_t>
Expand Down Expand Up @@ -1261,7 +1282,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
d_data_.data_handle(),
build_config_.dataset_dim,
l2_norms_.data_handle(),
batch.offset());
batch.offset(),
build_config_.metric);
}
graph_.clear();
Expand Down Expand Up @@ -1417,6 +1439,11 @@ void build(raft::resources const& res,
RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits<int>::max() - 1,
"The dataset size for GNND should be less than %d",
std::numeric_limits<int>::max() - 1);
auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded ||
params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.metric == cuvs::distance::DistanceType::InnerProduct;
RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric,
"The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct");
size_t intermediate_degree = params.intermediate_graph_degree;
size_t graph_degree = params.graph_degree;
Expand Down Expand Up @@ -1452,7 +1479,8 @@ void build(raft::resources const& res,
.internal_node_degree = extended_intermediate_degree,
.max_iterations = params.max_iterations,
.termination_threshold = params.termination_threshold,
.output_graph_degree = params.graph_degree};
.output_graph_degree = params.graph_degree,
.metric = params.metric};
GNND<const T, int> nnd(res, build_config);
Expand Down Expand Up @@ -1500,8 +1528,11 @@ index<IdxT> build(
graph_degree = intermediate_degree;
}
index<IdxT> idx{
res, dataset.extent(0), static_cast<int64_t>(graph_degree), params.return_distances};
index<IdxT> idx{res,
dataset.extent(0),
static_cast<int64_t>(graph_degree),
params.return_distances,
params.metric};
build(res, params, dataset, idx);
Expand Down
Loading

0 comments on commit 165a1cd

Please sign in to comment.