From e2902831d307bb7ad59046ed9c9646a15807f3e7 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 6 Sep 2023 13:19:16 -0400 Subject: [PATCH 1/5] Simplify wheel build scripts and allow alphas of RAPIDS dependencies (#1804) This PR: 1. Removes `ci/apply_wheel_modifications.sh` and uses it inline in wheel build scripts 2. Allows for specifying alpha versioned dependencies of RAPIDS projects Authors: - Divye Gala (https://github.com/divyegala) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/raft/pull/1804 --- ci/build_wheel.sh | 33 ++++++++++++++++++++++--- ci/build_wheel_raft_dask.sh | 5 ---- ci/release/apply_wheel_modifications.sh | 25 ------------------- 3 files changed, 30 insertions(+), 33 deletions(-) delete mode 100755 ci/release/apply_wheel_modifications.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index a9f7f64294..af66de7f8b 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -15,9 +15,36 @@ version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. +pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. 
The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ ${package_name} == "raft_dask" ]]; then + sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -i "s/ucx-py/ucx-py${PACKAGE_CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml +else + sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} +fi cd "${package_dir}" diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index f0204d45c0..e90412fb80 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -6,9 +6,4 @@ set -euo pipefail # Set up skbuild options. Enable sccache in skbuild config options export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - -RAPIDS_PY_WHEEL_NAME=pylibraft_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibraft -python -m pip install --no-deps ./local-pylibraft/pylibraft*.whl - ci/build_wheel.sh raft_dask python/raft-dask diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index fd6c2f929e..0000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. 
-# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# pyproject.toml versions -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/pylibraft/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/raft-dask/pyproject.toml - -# pylibraft pyproject.toml cuda suffixes -sed -i "s/^name = \"pylibraft\"/name = \"pylibraft${CUDA_SUFFIX}\"/g" python/pylibraft/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/pylibraft/pyproject.toml - -# raft-dask pyproject.toml cuda suffixes -sed -i "s/^name = \"raft-dask\"/name = \"raft-dask${CUDA_SUFFIX}\"/g" python/raft-dask/pyproject.toml -sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml -sed -i "s/ucx-py/ucx-py${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/pylibraft/pyproject.toml - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/pylibraft/pyproject.toml -fi From f691fc9651dc83a2e6b3ce7a8de524f4c2a57c44 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 6 Sep 2023 17:40:35 -0400 Subject: [PATCH 2/5] Fix `raft-dask` naming in wheel builds (#1805) Before https://github.com/rapidsai/raft/pull/1804/files, we were using `raft-dask` and `raft_dask` interchangeably because they were hardcoded. 
Since the package name is now captured as a variable, it's a bug to use `raft_dask` as the pyproject name does not get updated correctly and we end up publishing `raft-dask` wheels without `cu{version}` suffixes. Authors: - Divye Gala (https://github.com/divyegala) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/raft/pull/1805 --- ci/build_wheel.sh | 5 +++-- ci/build_wheel_raft_dask.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index af66de7f8b..662a11ad0e 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -5,6 +5,7 @@ set -euo pipefail package_name=$1 package_dir=$2 +underscore_package_name=$(echo "${package_name}" | tr "-" "_") source rapids-configure-sccache source rapids-date-string @@ -34,7 +35,7 @@ if ! rapids-is-release-build; then alpha_spec=',>=0.0.0a0' fi -if [[ ${package_name} == "raft_dask" ]]; then +if [[ ${package_name} == "raft-dask" ]]; then sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} sed -i "s/ucx-py/ucx-py${PACKAGE_CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml else @@ -54,4 +55,4 @@ python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index e90412fb80..ff89f4da23 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -6,4 +6,4 @@ set -euo pipefail # Set up skbuild options. 
Enable sccache in skbuild config options export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" -ci/build_wheel.sh raft_dask python/raft-dask +ci/build_wheel.sh raft-dask python/raft-dask From be378eec26752f411d9fe2f8b330ce0665389a18 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 7 Sep 2023 08:56:45 -0400 Subject: [PATCH 3/5] A few fixes to `raft-ann-bench` recipe and docs (#1806) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) - Dante Gama Dessavre (https://github.com/dantegd) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/raft/pull/1806 --- conda/recipes/raft-ann-bench-cpu/meta.yaml | 1 + conda/recipes/raft-ann-bench/meta.yaml | 6 +++++- docs/source/raft_ann_benchmarks.md | 16 ++++++++-------- .../src/raft-ann-bench/run/__main__.py | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml index 355ea640ff..699e485d0b 100644 --- a/conda/recipes/raft-ann-bench-cpu/meta.yaml +++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml @@ -57,6 +57,7 @@ requirements: - matplotlib - python - pyyaml + - benchmark about: home: https://rapids.ai/ diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index 882ff6cc49..a5c20b0a28 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -90,7 +90,11 @@ requirements: - libfaiss {{ faiss_version }} {% endif %} - h5py {{ h5py_version }} - + - benchmark + - glog {{ glog_version }} + - matplotlib + - python + - pyyaml about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index e0c02bb7eb..af0b040d34 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -8,7 +8,7 @@ The easiest way to install 
these benchmarks is through conda. We provide package ```bash -mamba env create --name raft_ann_benchmarks +mamba create --name raft_ann_benchmarks conda activate raft_ann_benchmarks # to install GPU package: @@ -25,7 +25,7 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks ### Usage -There are 3 general steps to running the benchmarks and vizualizing the results: +There are 4 general steps to running the benchmarks and visualizing the results: 1. Prepare Dataset 2. Build Index and Search Index 3. Data Export @@ -39,7 +39,7 @@ expected to be defined to run these scripts; this variable holds the directory w ### End-to-end example: Million-scale -The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset By default the datasets will be stored and used from the folder indicated by the RAPIDS_DATASET_ROOT_DIR environment variable if defined, otherwise a datasets subfolder from where the script is being called: +The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset By default the datasets will be stored and used from the folder indicated by the RAPIDS_DATASET_ROOT_DIR environment variable if defined, otherwise a datasets sub-folder from where the script is being called: ```bash @@ -56,7 +56,7 @@ python -m raft-ann-bench.data_export --dataset deep-image-96-inner python -m raft-ann-bench.plot --dataset deep-image-96-inner ``` -Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `python/raft-ann-bench/src/raft-ann-bench/conf`. +Configuration files already exist for the following list of the million-scale datasets. 
Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `python/raft-ann-bench/src/raft-ann-bench/run/conf`. - `deep-image-96-angular` - `fashion-mnist-784-euclidean` - `glove-50-angular` @@ -80,17 +80,17 @@ mkdir -p datasets/deep-1B # (1) prepare dataset # download manually "Ground Truth" file of "Yandex DEEP" # suppose the file name is deep_new_groundtruth.public.10K.bin -python python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin +python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced # (2) build and search index -python python -m raft-ann-bench.run --dataset deep-1B +python -m raft-ann-bench.run --dataset deep-1B # (3) export data -python python -m raft-ann-bench.data_export --dataset deep-1B +python -m raft-ann-bench.data_export --dataset deep-1B # (4) plot results -python python -m raft-ann-bench.plot --dataset deep-1B +python -m raft-ann-bench.plot --dataset deep-1B ``` The usage of `python -m raft-ann-bench.split-groundtruth` is: diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index d5a65ddfb7..347c68c477 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -145,7 +145,7 @@ def main(): # Read list of allowed algorithms try: - import pylibraft # noqa: F401 + import rmm # noqa: F401 gpu_present = True except ImportError: From ec4236a9905aee1ad79a11265b6f8240bf9657c3 Mon Sep 17 00:00:00 2001 From: "Artem M. 
Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 7 Sep 2023 19:07:03 +0200 Subject: [PATCH 4/5] ann-bench: miscellaneous improvements (#1808) 1. IVF-PQ: slightly improve stream ordering 2. IVF-PQ: build param 'codebook_kind' - as per `ivf_pq_types`. 3. FAISS IVF models: build param 'ratio' with the same meaning as in IVF-PQ - the clustering algorithm uses `1/ratio` of the given dataset for training. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1808 --- cpp/bench/ann/src/faiss/faiss_benchmark.cu | 16 +++-- cpp/bench/ann/src/faiss/faiss_wrapper.h | 75 +++++++++++++------- cpp/bench/ann/src/raft/raft_benchmark.cu | 16 +++-- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 41 ++++++----- docs/source/ann_benchmarks_param_tuning.md | 12 ++-- 5 files changed, 105 insertions(+), 55 deletions(-) diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu index 231154ccfd..56885cce5c 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -30,19 +30,27 @@ namespace raft::bench::ann { +template +void parse_base_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + template void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) { - param.nlist = conf.at("nlist"); + parse_base_build_param(conf, param); } template void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) { - param.nlist = conf.at("nlist"); - param.M = conf.at("M"); + parse_base_build_param(conf, param); + param.M = conf.at("M"); if (conf.contains("usePrecomputed")) { param.usePrecomputed = conf.at("usePrecomputed"); } else { @@ -59,7 
+67,7 @@ template void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) { - param.nlist = conf.at("nlist"); + parse_base_build_param(conf, param); param.quantizer_type = conf.at("quantizer_type"); } diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h index ec80e6cbfd..672c685b1f 100644 --- a/cpp/bench/ann/src/faiss/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -18,6 +18,7 @@ #include "../common/ann_types.hpp" +#include #include #include @@ -85,7 +86,23 @@ class FaissGpu : public ANN { float refine_ratio = 1.0; }; - FaissGpu(Metric metric, int dim, int nlist); + struct BuildParam { + int nlist = 1; + int ratio = 2; + }; + + FaissGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); + faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); + } + virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; @@ -131,23 +148,35 @@ class FaissGpu : public ANN { int device_; cudaEvent_t sync_{nullptr}; cudaStream_t faiss_default_stream_{nullptr}; + double training_sample_fraction_; }; -template -FaissGpu::FaissGpu(Metric metric, int dim, int nlist) - : ANN(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist) -{ - static_assert(std::is_same_v, "faiss support only float type"); - RAFT_CUDA_TRY(cudaGetDevice(&device_)); - RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); - faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); -} - template void FaissGpu::build(const T* dataset, size_t nrow, 
cudaStream_t stream) { OmpSingleThreadScope omp_single_thread; - + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + RAFT_LOG_WARN( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing assert(index_->is_trained); index_->add(nrow, dataset); @@ -208,12 +237,9 @@ void FaissGpu::load_(const std::string& file) template class FaissGpuIVFFlat : public FaissGpu { public: - struct BuildParam { - int nlist; - }; + using typename FaissGpu::BuildParam; - FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) - : FaissGpu(metric, dim, param.nlist) + FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = this->device_; @@ -234,15 +260,13 @@ class FaissGpuIVFFlat : public FaissGpu { template class FaissGpuIVFPQ : public FaissGpu { public: - struct BuildParam { - int nlist; + struct BuildParam : public FaissGpu::BuildParam { int M; bool useFloat16; bool usePrecomputed; }; - FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) - : FaissGpu(metric, dim, param.nlist) + FaissGpuIVFPQ(Metric 
metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) { faiss::gpu::GpuIndexIVFPQConfig config; config.useFloat16LookupTables = param.useFloat16; @@ -271,13 +295,11 @@ class FaissGpuIVFPQ : public FaissGpu { template class FaissGpuIVFSQ : public FaissGpu { public: - struct BuildParam { - int nlist; + struct BuildParam : public FaissGpu::BuildParam { std::string quantizer_type; }; - FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) - : FaissGpu(metric, dim, param.nlist) + FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) { faiss::ScalarQuantizer::QuantizerType qtype; if (param.quantizer_type == "fp16") { @@ -310,7 +332,8 @@ class FaissGpuIVFSQ : public FaissGpu { template class FaissGpuFlat : public FaissGpu { public: - FaissGpuFlat(Metric metric, int dim) : FaissGpu(metric, dim, 0) + FaissGpuFlat(Metric metric, int dim) + : FaissGpu(metric, dim, typename FaissGpu::BuildParam{}) { faiss::gpu::GpuIndexFlatConfig config; config.device = this->device_; diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index aa25d1532f..7ba381ab0a 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -58,10 +58,7 @@ void parse_build_param(const nlohmann::json& conf, { param.n_lists = conf.at("nlist"); if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { - param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); - std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; - } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } } template @@ -82,6 +79,17 @@ void parse_build_param(const nlohmann::json& conf, if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } if 
(conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("codebook_kind")) { + std::string kind = conf.at("codebook_kind"); + if (kind == "cluster") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; + } else if (kind == "subspace") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + } else { + throw std::runtime_error("codebook_kind: '" + kind + + "', should be either 'cluster' or 'subspace'"); + } + } } template diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 1554c1f016..8f1e43a706 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -63,9 +63,14 @@ class RaftIvfPQ : public ANN { rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); } - ~RaftIvfPQ() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + ~RaftIvfPQ() noexcept + { + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); + rmm::mr::set_current_device_resource(mr_.get_upstream()); + } void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -96,6 +101,7 @@ class RaftIvfPQ : public ANN { // `mr_` must go first to make sure it dies last rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; + cudaEvent_t sync_{nullptr}; BuildParam index_params_; raft::neighbors::ivf_pq::search_params search_params_; std::optional> index_; @@ -103,6 +109,12 @@ class RaftIvfPQ : public ANN { int dimension_; float refine_ratio_ = 1.0; raft::device_matrix_view dataset_; + + void stream_wait(cudaStream_t stream) const + { + RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); + } }; template @@ -121,12 +133,12 @@ void RaftIvfPQ::load(const std::string& file) } 
template -void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) +void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t stream) { auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); - return; + stream_wait(stream); } template @@ -176,16 +188,14 @@ void RaftIvfPQ::search(const T* queries, neighbors_v, distances_v, index_->metric()); + stream_wait(stream); // RAFT stream -> bench stream } else { auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); auto candidates_host = raft::make_host_matrix(batch_size, k0); auto neighbors_host = raft::make_host_matrix(batch_size, k); auto distances_host = raft::make_host_matrix(batch_size, k); - raft::copy(queries_host.data_handle(), - queries, - queries_host.size(), - resource::get_cuda_stream(handle_)); + raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream); raft::copy(candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), @@ -194,6 +204,10 @@ void RaftIvfPQ::search(const T* queries, auto dataset_v = raft::make_host_matrix_view( dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1)); + // wait for the queries to copy to host in 'stream` and for IVF-PQ::search to finish + RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); + RAFT_CUDA_TRY(cudaEventRecord(sync_, stream)); + RAFT_CUDA_TRY(cudaEventSynchronize(sync_)); raft::runtime::neighbors::refine(handle_, dataset_v, queries_host.view(), @@ -202,14 +216,8 @@ void RaftIvfPQ::search(const T* queries, distances_host.view(), index_->metric()); - raft::copy(neighbors, - (size_t*)neighbors_host.data_handle(), - neighbors_host.size(), - resource::get_cuda_stream(handle_)); - raft::copy(distances, - distances_host.data_handle(), - distances_host.size(), - resource::get_cuda_stream(handle_)); + raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), 
neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); } } else { auto queries_v = @@ -219,8 +227,7 @@ void RaftIvfPQ::search(const T* queries, raft::runtime::neighbors::ivf_pq::search( handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); + stream_wait(stream); // RAFT stream -> bench stream } - resource::sync_stream(handle_); - return; } } // namespace raft::bench::ann diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index 020c2d5ad9..ca8ffa5e18 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -1,6 +1,6 @@ # ANN Benchmarks Parameter Tuning Guide -This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall. +This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall. ## RAFT Indexes @@ -15,8 +15,8 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe |-----------|------------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. 
| | `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | -| `ratio` | `build_param` | N | Positive Float >0 | 0.5 | Fraction of the number of training points which should be used to train the clusters. | -| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | +| `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | ### `raft_ivf_pq` @@ -27,8 +27,10 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of |-------------------------|----------------|---|------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | | `niter` | `build_param` | N | Positive Integer >0 | 20 | Number of k-means iterations to use when training the clusters. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | | `pq_dim` | `build_param` | N | Positive Integer. Multiple of 8. | 0 | Dimensionality of the vector after product quantization. 
When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8. | | `pq_bits` | `build_param` | N | Positive Integer. [4-8] | 8 | Bit length of the vector element after quantization. | +| `codebook_kind` | `build_param` | N | ["cluster", "subspace"] | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail | | `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | | `internalDistanceDtype` | `search_params` | N | [`float`, `half`] | `half` | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy. | | `smemLutDtype` | `search_params` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. | @@ -58,7 +60,8 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe | Parameter | Type | Required | Data Type | Default | Description | |-----------|----------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | +| `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. 
| +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | | `nprobe` | `search_params` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. | ### `faiss_gpu_ivf_pq` @@ -68,6 +71,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | Parameter | Type | Required | Data Type | Default | Description | |------------------|----------------|----------|----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `nlists` | `build_param` | Y | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. | +| `ratio` | `build_param` | N | Positive Integer >0 | 2 | `1/ratio` is the number of training points which should be used to train the clusters. | | `M` | `build_param` | Y | Positive Integer Power of 2 [8-64] | | Number of chunks or subquantizers for each vector. | | `usePrecomputed` | `build_param` | N | Boolean. Default=`false` | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage. | | `useFloat16` | `build_param` | N | Boolean. Default=`false` | `false` | Use half-precision floats for clustering step. | From e1c8566f20556be0c5b0a59f782f843a1b97416f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Sep 2023 13:57:26 -0500 Subject: [PATCH 5/5] Use `conda mambabuild` not `mamba mambabuild` (#1812) With the release of conda 23.7.3, `mamba mambabuild` stopped working. With boa installed, `conda mambabuild` uses the mamba solver, so just use that instead. 
See also https://github.com/rapidsai/cudf/issues/14068. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/raft/pull/1812 --- ci/build_cpp.sh | 2 +- ci/build_python.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 853ae095d3..d2d2d08b99 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,6 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/libraft +rapids-conda-retry mambabuild conda/recipes/libraft rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 2a31deb46a..c49677e78c 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,19 +15,19 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibraft -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/raft-dask # Build ann-bench for each cuda and python version -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -37,7 +37,7 @@ conda/recipes/raft-ann-bench # version RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - rapids-mamba-retry mambabuild \ + rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \