Merge branch 'branch-23.12' into 23.12-doc-filtering
cjnolet authored Nov 16, 2023
2 parents 1a4a117 + 31fcbf1 commit e70885f
Showing 49 changed files with 2,115 additions and 830 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
- checks
- conda-cpp-build
- conda-cpp-tests
- conda-cpp-checks
- conda-python-build
- conda-python-tests
- docs-build
@@ -43,6 +44,14 @@ jobs:
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
with:
build_type: pull-request
conda-cpp-checks:
needs: conda-cpp-build
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12
with:
build_type: pull-request
enable_check_symbols: true
symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass)
conda-python-build:
needs: conda-cpp-build
secrets: inherit
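The symbol_exclusions value is an extended regular expression: exported symbols whose names match it are exempted from the new symbol-visibility check. As a rough, standalone illustration (not part of the commit; grep's ERE has no \d, so [0-9] stands in for it):

# Hypothetical smoke test for the exclusion pattern.
pattern='(void (thrust::|cub::)|_ZN[0-9]+raft_cutlass)'
printf '%s\n' \
  'void thrust::detail::some_kernel<int>(int*)' \
  '_ZN12raft_cutlass6KernelEv' \
  'raft::neighbors::ivf_flat::search' \
  | grep -E "$pattern"
# Prints only the first two lines: thrust/cub instantiations and mangled
# raft_cutlass symbols are excluded from the check; plain raft symbols are not.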
10 changes: 10 additions & 0 deletions .github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
type: string

jobs:
conda-cpp-checks:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12
with:
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
enable_check_symbols: true
symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass)
conda-cpp-tests:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
2 changes: 2 additions & 0 deletions ci/build_wheel.sh
@@ -39,6 +39,8 @@ fi
if [[ ${package_name} == "raft-dask" ]]; then
sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
else
sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
fi
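For context (not part of the commit), these sed expressions rewrite dependency pins in the generated pyproject.toml: the version spec is captured with (.*) and re-emitted with the CUDA suffix and/or alpha spec appended. A minimal dry run of the new rapids-dask-dependency substitution, assuming alpha_spec=',>=0.0.0a0' as in nightly builds:

# Standalone rerun of one substitution; values are assumed.
alpha_spec=',>=0.0.0a0'
echo '"rapids-dask-dependency==23.12.*",' \
  | sed -r "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g"
# -> "rapids-dask-dependency==23.12.*,>=0.0.0a0",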
16 changes: 10 additions & 6 deletions ci/release/update-version.sh
@@ -56,7 +56,12 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.
DEPENDENCIES=(
dask-cuda
pylibraft
pylibraft-cu11
pylibraft-cu12
rmm
rmm-cu11
rmm-cu12
rapids-dask-dependency
# ucx-py is handled separately below
)
for FILE in dependencies.yaml conda/environments/*.yaml; do
@@ -75,11 +80,6 @@ done

sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml

# Wheel builds install dask-cuda from source, update its branch
for FILE in .github/workflows/*.yaml; do
sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
done

for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
@@ -88,9 +88,13 @@ sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TA
sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile

sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md
sed_runner "/GIT_TAG.*branch-/ s|branch-.*|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md
sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md
sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md

sed_runner "s|:[0-9][0-9].[0-9][0-9]|:${NEXT_SHORT_TAG}|g" docs/source/raft_ann_benchmarks.md

sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" README.md

# .devcontainer files
find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
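sed_runner is the in-place edit helper this script uses throughout; its definition sits outside the hunks shown here. A plausible minimal form, plus a dry run of the README substitution (both are assumptions, for illustration only):

# Assumed shape of the helper; the real definition lives earlier in the script.
sed_runner() { sed -i "$1" "$2"; }
# Dry run with a hypothetical NEXT_SHORT_TAG:
NEXT_SHORT_TAG=24.02
echo 'pip install git+https://github.com/rapidsai/raft.git@branch-23.12' \
  | sed "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g"
# -> pip install git+https://github.com/rapidsai/raft.git@branch-24.02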
3 changes: 0 additions & 3 deletions ci/test_wheel_raft_dask.sh
@@ -11,9 +11,6 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl

# Always install latest dask for testing
python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/raft_dask*.whl)[test]

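A note on the echo trick in the last line above: written directly as ./dist/raft_dask*.whl[test], the whole token is a single glob, and the trailing [test] bracket expression keeps it from matching the wheel at all, so pip would receive the literal unexpanded pattern. Expanding the wheel path first and then appending the extras marker avoids this; an equivalent illustration with a made-up wheel name:

# Suppose ./dist contains raft_dask-23.12.0-cp310-cp310-linux_x86_64.whl (hypothetical).
# ./dist/raft_dask*.whl[test]         -> one glob, matches nothing, passed verbatim.
# $(echo ./dist/raft_dask*.whl)[test] -> glob expands first, then [test] is appended:
python -m pip install "$(echo ./dist/raft_dask*.whl)[test]"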
4 changes: 1 addition & 3 deletions conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -20,10 +20,7 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.9.2
- dask-cuda==23.12.*
- dask>=2023.9.2
- distributed>=2023.9.2
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- gmock>=1.13.0
@@ -49,6 +46,7 @@ dependencies:
- pydata-sphinx-theme
- pytest
- pytest-cov
- rapids-dask-dependency==23.12.*
- recommonmark
- rmm==23.12.*
- scikit-build>=0.13.1
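rapids-dask-dependency is a metapackage that centralizes the dask and distributed pins previously repeated in every environment file; the same swap appears in the three environment files below and in the raft-dask recipe. A sketch of inspecting what the metapackage resolves to (channel and output shape are assumptions):

# Show the metapackage's own dependency pins (illustrative invocation).
conda search --info 'rapids-dask-dependency==23.12.*' -c rapidsai-nightly | grep -A5 'dependencies'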
4 changes: 1 addition & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,10 +20,7 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.9.2
- dask-cuda==23.12.*
- dask>=2023.9.2
- distributed>=2023.9.2
- doxygen>=1.8.20
- gcc_linux-64=11.*
- gmock>=1.13.0
@@ -49,6 +46,7 @@ dependencies:
- pydata-sphinx-theme
- pytest
- pytest-cov
- rapids-dask-dependency==23.12.*
- recommonmark
- rmm==23.12.*
- scikit-build>=0.13.1
4 changes: 1 addition & 3 deletions conda/environments/all_cuda-120_arch-aarch64.yaml
@@ -21,10 +21,7 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.9.2
- dask-cuda==23.12.*
- dask>=2023.9.2
- distributed>=2023.9.2
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- gmock>=1.13.0
@@ -45,6 +42,7 @@ dependencies:
- pydata-sphinx-theme
- pytest
- pytest-cov
- rapids-dask-dependency==23.12.*
- recommonmark
- rmm==23.12.*
- scikit-build>=0.13.1
4 changes: 1 addition & 3 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -21,10 +21,7 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.9.2
- dask-cuda==23.12.*
- dask>=2023.9.2
- distributed>=2023.9.2
- doxygen>=1.8.20
- gcc_linux-64=11.*
- gmock>=1.13.0
@@ -45,6 +42,7 @@ dependencies:
- pydata-sphinx-theme
- pytest
- pytest-cov
- rapids-dask-dependency==23.12.*
- recommonmark
- rmm==23.12.*
- scikit-build>=0.13.1
4 changes: 1 addition & 3 deletions conda/recipes/raft-dask/meta.yaml
@@ -60,10 +60,8 @@ requirements:
- cudatoolkit
{% endif %}
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
- dask >=2023.9.2
- dask-core >=2023.9.2
- dask-cuda ={{ minor_version }}
- distributed >=2023.9.2
- rapids-dask-dependency ={{ minor_version }}
- joblib >=0.11
- nccl >=2.9.9
- pylibraft {{ version }}
19 changes: 18 additions & 1 deletion cpp/bench/ann/CMakeLists.txt
@@ -30,6 +30,7 @@ option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm
option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON)
option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA with HNSW search in benchmark" ON)
option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_SINGLE_EXE
@@ -54,6 +55,7 @@ if(BUILD_CPU_ONLY)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF)
set(RAFT_ANN_BENCH_USE_GGNN OFF)
else()
# Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
@@ -88,14 +90,15 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ
OR RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE
OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
OR RAFT_ANN_BENCH_USE_RAFT_CAGRA
OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB
)
set(RAFT_ANN_BENCH_USE_RAFT ON)
endif()

# ##################################################################################################
# * Fetch requirements -------------------------------------------------------------

if(RAFT_ANN_BENCH_USE_HNSWLIB)
if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
include(cmake/thirdparty/get_hnswlib.cmake)
endif()

@@ -250,6 +253,20 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
ConfigureAnnBench(
NAME
RAFT_CAGRA_HNSWLIB
PATH
bench/ann/src/raft/raft_cagra_hnswlib.cu
INCLUDES
${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib
LINKS
raft::compiled
CXXFLAGS "${HNSW_CXX_FLAGS}"
)
endif()

set(RAFT_FAISS_TARGETS faiss::faiss)
if(TARGET faiss::faiss_avx2)
set(RAFT_FAISS_TARGETS faiss::faiss_avx2)
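The new RAFT_CAGRA_HNSWLIB benchmark builds a CAGRA graph and then searches it through hnswlib. A hedged sketch of configuring just this benchmark (the option names come from this diff; BUILD_ANN_BENCH and the source layout are assumptions):

# Configure and build only the CAGRA->hnswlib benchmark (illustrative flags).
cmake -S cpp -B cpp/build \
  -DBUILD_ANN_BENCH=ON \
  -DRAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB=ON \
  -DRAFT_ANN_BENCH_USE_RAFT_CAGRA=OFF \
  -DRAFT_ANN_BENCH_USE_HNSWLIB=OFF
cmake --build cpp/build --parallel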
76 changes: 50 additions & 26 deletions cpp/bench/ann/src/common/benchmark.hpp
@@ -33,6 +33,7 @@
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <string>
#include <unistd.h>
#include <vector>
@@ -175,7 +176,6 @@ void bench_search(::benchmark::State& state,
std::shared_ptr<const Dataset<T>> dataset,
Objective metric_objective)
{
std::ptrdiff_t batch_offset = 0;
std::size_t queries_processed = 0;

const auto& sp_json = index.search_params[search_param_ix];
@@ -189,6 +189,20 @@
// Round down the query data to a multiple of the batch size to loop over full batches of data
const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;

if (dataset->query_set_size() < n_queries) {
std::stringstream msg;
msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual "
<< dataset->query_set_size();
return state.SkipWithError(msg.str());
}

// Each thread starts from a different offset, so that the queries they process do not
// overlap.
std::ptrdiff_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
std::ptrdiff_t queries_stride = state.threads() * n_queries;
// Output is saved into a contiguous buffer (separate buffers for each thread).
std::ptrdiff_t out_offset = 0;

const T* query_set = nullptr;

if (!file_exists(index.file)) {
@@ -273,12 +287,11 @@ void bench_search(::benchmark::State& state,
std::shared_ptr<buf<std::size_t>> neighbors =
std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);

auto start = std::chrono::high_resolution_clock::now();
cuda_timer gpu_timer;
auto start = std::chrono::high_resolution_clock::now();
{
nvtx_case nvtx{state.name()};

// TODO: Have the odd threads load the queries backwards just to rule out caching.
ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
for (auto _ : state) {
[[maybe_unused]] auto ntx_lap = nvtx.lap();
Expand All @@ -289,15 +302,16 @@ void bench_search(::benchmark::State& state,
algo->search(query_set + batch_offset * dataset->dim(),
n_queries,
k,
neighbors->data + batch_offset * k,
distances->data + batch_offset * k,
neighbors->data + out_offset * k,
distances->data + out_offset * k,
gpu_timer.stream());
} catch (const std::exception& e) {
state.SkipWithError(std::string(e.what()));
}

// advance to the next batch
batch_offset = (batch_offset + n_queries) % query_set_size;
batch_offset = (batch_offset + queries_stride) % query_set_size;
out_offset = (out_offset + n_queries) % query_set_size;

queries_processed += n_queries;
}
@@ -323,31 +337,41 @@
// last thread to finish processing notifies all
if (processed_threads-- == 0) { cond_var.notify_all(); }

// Use the last thread as a sanity check that all the threads are working.
if (state.thread_index() == state.threads() - 1) {
// evaluate recall
if (dataset->max_k() >= k) {
const std::int32_t* gt = dataset->gt_set();
const std::uint32_t max_k = dataset->max_k();
buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
std::size_t rows = std::min(queries_processed, query_set_size);
std::size_t match_count = 0;
std::size_t total_count = rows * static_cast<size_t>(k);
for (std::size_t i = 0; i < rows; i++) {
for (std::uint32_t j = 0; j < k; j++) {
auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
for (std::uint32_t l = 0; l < k; l++) {
auto exp_idx = gt[i * max_k + l];
if (act_idx == exp_idx) {
match_count++;
break;
// Each thread calculates recall on its partition of queries.
// evaluate recall
if (dataset->max_k() >= k) {
const std::int32_t* gt = dataset->gt_set();
const std::uint32_t max_k = dataset->max_k();
buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
std::size_t rows = std::min(queries_processed, query_set_size);
std::size_t match_count = 0;
std::size_t total_count = rows * static_cast<size_t>(k);

// We go through the groundtruth with the same stride as the benchmark loop.
size_t out_offset = 0;
size_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
while (out_offset < rows) {
for (std::size_t i = 0; i < n_queries; i++) {
size_t i_orig_idx = batch_offset + i;
size_t i_out_idx = out_offset + i;
if (i_out_idx < rows) {
for (std::uint32_t j = 0; j < k; j++) {
auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]);
for (std::uint32_t l = 0; l < k; l++) {
auto exp_idx = gt[i_orig_idx * max_k + l];
if (act_idx == exp_idx) {
match_count++;
break;
}
}
}
}
}
double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
state.counters.insert({{"Recall", actual_recall}});
out_offset += n_queries;
batch_offset = (batch_offset + queries_stride) % query_set_size;
}
double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
}
}

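The core of the benchmark change: with state.threads() threads each issuing batches of n_queries, thread t starts at (t * n_queries) % query_set_size and advances by state.threads() * n_queries per iteration, so no two threads ever touch the same batch. A toy rerun of that arithmetic with made-up sizes:

# Offsets visited per thread for query_set_size=16, n_queries=4, threads=2.
query_set_size=16 n_queries=4 threads=2
stride=$((threads * n_queries))
for t in 0 1; do
  off=$(( (t * n_queries) % query_set_size ))
  line="thread $t:"
  for _ in 1 2 3 4; do line="$line $off"; off=$(( (off + stride) % query_set_size )); done
  echo "$line"
done
# -> thread 0: 0 8 0 8
# -> thread 1: 4 12 4 12

The out_offset counter advances by n_queries regardless, so each thread's results land contiguously in its own output buffer, and the recall loop replays the same batch_offset/out_offset walk to line results up with the ground truth.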
2 changes: 2 additions & 0 deletions cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -91,6 +91,8 @@ class HnswLib : public ANN<T> {
return property;
}

void set_base_layer_only() { appr_alg_->base_layer_only = true; }

private:
void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const;

[Diff truncated: the remaining 36 of the 49 changed files are not shown.]
