diff --git a/CHANGELOG.md b/CHANGELOG.md index 16c3ba4985..8642f2bdf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,97 @@ +# raft 23.08.00 (9 Aug 2023) + +## 🚨 Breaking Changes + +- Separate CAGRA index type from internal idx type ([#1664](https://github.com/rapidsai/raft/pull/1664)) [@tfeher](https://github.com/tfeher) +- Stop using setup.py in build.sh ([#1645](https://github.com/rapidsai/raft/pull/1645)) [@vyasr](https://github.com/vyasr) +- CAGRA max_queries auto configuration ([#1613](https://github.com/rapidsai/raft/pull/1613)) [@enp1s0](https://github.com/enp1s0) +- Rename the CAGRA prune function to optimize ([#1588](https://github.com/rapidsai/raft/pull/1588)) [@enp1s0](https://github.com/enp1s0) +- CAGRA pad dataset for 128bit vectorized load ([#1505](https://github.com/rapidsai/raft/pull/1505)) [@tfeher](https://github.com/tfeher) +- Sparse Pairwise Distances API Updates ([#1502](https://github.com/rapidsai/raft/pull/1502)) [@divyegala](https://github.com/divyegala) +- Cagra index construction without copying device mdarrays ([#1494](https://github.com/rapidsai/raft/pull/1494)) [@tfeher](https://github.com/tfeher) +- [FEA] Masked NN for connect_components ([#1445](https://github.com/rapidsai/raft/pull/1445)) [@tarang-jain](https://github.com/tarang-jain) +- Limiting workspace memory resource ([#1356](https://github.com/rapidsai/raft/pull/1356)) [@achirkin](https://github.com/achirkin) + +## 🐛 Bug Fixes + +- Remove push condition on docs-build ([#1693](https://github.com/rapidsai/raft/pull/1693)) [@raydouglass](https://github.com/raydouglass) +- IVF-PQ: Fix illegal memory access with large max_samples ([#1685](https://github.com/rapidsai/raft/pull/1685)) [@achirkin](https://github.com/achirkin) +- Fix missing parameter for select_k ([#1682](https://github.com/rapidsai/raft/pull/1682)) [@ucassjy](https://github.com/ucassjy) +- Separate CAGRA index type from internal idx type ([#1664](https://github.com/rapidsai/raft/pull/1664)) 
[@tfeher](https://github.com/tfeher) +- Add rmm to pylibraft run dependencies, since it is used by Cython. ([#1656](https://github.com/rapidsai/raft/pull/1656)) [@bdice](https://github.com/bdice) +- Hotfix: wrong constant in IVF-PQ fp_8bit2half ([#1654](https://github.com/rapidsai/raft/pull/1654)) [@achirkin](https://github.com/achirkin) +- Fix sparse KNN for large batches ([#1640](https://github.com/rapidsai/raft/pull/1640)) [@viclafargue](https://github.com/viclafargue) +- Fix uploading of RAFT nightly packages ([#1638](https://github.com/rapidsai/raft/pull/1638)) [@dantegd](https://github.com/dantegd) +- Fix cagra multi CTA bug ([#1628](https://github.com/rapidsai/raft/pull/1628)) [@enp1s0](https://github.com/enp1s0) +- pass correct stream to cutlass kernel launch of L2/cosine pairwise distance kernels ([#1597](https://github.com/rapidsai/raft/pull/1597)) [@mdoijade](https://github.com/mdoijade) +- Fix launchconfig y-gridsize too large in epilogue kernel ([#1586](https://github.com/rapidsai/raft/pull/1586)) [@mfoerste4](https://github.com/mfoerste4) +- Fix update version and pinnings for 23.08. 
([#1556](https://github.com/rapidsai/raft/pull/1556)) [@bdice](https://github.com/bdice) +- Fix for function exposing KNN merge ([#1418](https://github.com/rapidsai/raft/pull/1418)) [@viclafargue](https://github.com/viclafargue) + +## 📖 Documentation + +- Critical doc fixes and updates for 23.08 ([#1705](https://github.com/rapidsai/raft/pull/1705)) [@cjnolet](https://github.com/cjnolet) +- Fix the documentation about changing the logging level ([#1596](https://github.com/rapidsai/raft/pull/1596)) [@enp1s0](https://github.com/enp1s0) +- Fix raft::bitonic_sort small usage example ([#1580](https://github.com/rapidsai/raft/pull/1580)) [@enp1s0](https://github.com/enp1s0) + +## 🚀 New Features + +- Use rapids-cmake new parallel testing feature ([#1623](https://github.com/rapidsai/raft/pull/1623)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for row-major slice ([#1591](https://github.com/rapidsai/raft/pull/1591)) [@lowener](https://github.com/lowener) +- IVF-PQ tutorial notebook ([#1544](https://github.com/rapidsai/raft/pull/1544)) [@achirkin](https://github.com/achirkin) +- [FEA] Masked NN for connect_components ([#1445](https://github.com/rapidsai/raft/pull/1445)) [@tarang-jain](https://github.com/tarang-jain) +- raft: Build CUDA 12 packages ([#1388](https://github.com/rapidsai/raft/pull/1388)) [@vyasr](https://github.com/vyasr) +- Limiting workspace memory resource ([#1356](https://github.com/rapidsai/raft/pull/1356)) [@achirkin](https://github.com/achirkin) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release ([#1711](https://github.com/rapidsai/raft/pull/1711)) [@galipremsagar](https://github.com/galipremsagar) +- Add algo parameter for CAGRA ANN bench ([#1687](https://github.com/rapidsai/raft/pull/1687)) [@tfeher](https://github.com/tfeher) +- ANN benchmarks python wrapper for splitting billion-scale dataset groundtruth ([#1679](https://github.com/rapidsai/raft/pull/1679)) [@divyegala](https://github.com/divyegala) +- 
Rename CAGRA parameter num_parents to search_width ([#1676](https://github.com/rapidsai/raft/pull/1676)) [@tfeher](https://github.com/tfeher) +- Renaming namespaces to promote CAGRA from experimental ([#1666](https://github.com/rapidsai/raft/pull/1666)) [@cjnolet](https://github.com/cjnolet) +- CAGRA Python wrappers ([#1665](https://github.com/rapidsai/raft/pull/1665)) [@dantegd](https://github.com/dantegd) +- Add notebook for Vector Search - Question Retrieval ([#1662](https://github.com/rapidsai/raft/pull/1662)) [@lowener](https://github.com/lowener) +- Fix CMake CUDA support for pylibraft when raft is found. ([#1659](https://github.com/rapidsai/raft/pull/1659)) [@bdice](https://github.com/bdice) +- Cagra ANN benchmark improvements ([#1658](https://github.com/rapidsai/raft/pull/1658)) [@tfeher](https://github.com/tfeher) +- ANN-benchmarks: avoid using the dataset during search when possible ([#1657](https://github.com/rapidsai/raft/pull/1657)) [@achirkin](https://github.com/achirkin) +- Revert CUDA 12.0 CI workflows to branch-23.08. 
([#1652](https://github.com/rapidsai/raft/pull/1652)) [@bdice](https://github.com/bdice) +- ANN: Optimize host-side refine ([#1651](https://github.com/rapidsai/raft/pull/1651)) [@achirkin](https://github.com/achirkin) +- Cagra template instantiations ([#1650](https://github.com/rapidsai/raft/pull/1650)) [@tfeher](https://github.com/tfeher) +- Modify comm_split to avoid ucp ([#1649](https://github.com/rapidsai/raft/pull/1649)) [@ChuckHastings](https://github.com/ChuckHastings) +- Stop using setup.py in build.sh ([#1645](https://github.com/rapidsai/raft/pull/1645)) [@vyasr](https://github.com/vyasr) +- IVF-PQ: Add a (faster) direct conversion fp8->half ([#1644](https://github.com/rapidsai/raft/pull/1644)) [@achirkin](https://github.com/achirkin) +- Simplify `bench/ann` scripts to Python based module ([#1642](https://github.com/rapidsai/raft/pull/1642)) [@divyegala](https://github.com/divyegala) +- Further removal of uses-setup-env-vars ([#1639](https://github.com/rapidsai/raft/pull/1639)) [@dantegd](https://github.com/dantegd) +- Drop blank line in `raft-dask/meta.yaml` ([#1637](https://github.com/rapidsai/raft/pull/1637)) [@jakirkham](https://github.com/jakirkham) +- Enable conservative memory allocations for RAFT IVF-Flat benchmarks. 
([#1634](https://github.com/rapidsai/raft/pull/1634)) [@tfeher](https://github.com/tfeher) +- [FEA] Codepacking for IVF-flat ([#1632](https://github.com/rapidsai/raft/pull/1632)) [@tarang-jain](https://github.com/tarang-jain) +- Fixing ann bench cmake (and docs) ([#1630](https://github.com/rapidsai/raft/pull/1630)) [@cjnolet](https://github.com/cjnolet) +- [WIP] Test CI issues ([#1626](https://github.com/rapidsai/raft/pull/1626)) [@VibhuJawa](https://github.com/VibhuJawa) +- Set pool memory resource for raft IVF ANN benchmarks ([#1625](https://github.com/rapidsai/raft/pull/1625)) [@tfeher](https://github.com/tfeher) +- Adding sort option to matrix::select_k api ([#1615](https://github.com/rapidsai/raft/pull/1615)) [@cjnolet](https://github.com/cjnolet) +- CAGRA max_queries auto configuration ([#1613](https://github.com/rapidsai/raft/pull/1613)) [@enp1s0](https://github.com/enp1s0) +- Use exceptions instead of `exit(-1)` ([#1594](https://github.com/rapidsai/raft/pull/1594)) [@benfred](https://github.com/benfred) +- [REVIEW] Add scheduler_file argument to support MNMG setup ([#1593](https://github.com/rapidsai/raft/pull/1593)) [@VibhuJawa](https://github.com/VibhuJawa) +- Rename the CAGRA prune function to optimize ([#1588](https://github.com/rapidsai/raft/pull/1588)) [@enp1s0](https://github.com/enp1s0) +- This PR adds support to __half and nb_bfloat16 to myAtomicReduce ([#1585](https://github.com/rapidsai/raft/pull/1585)) [@Kh4ster](https://github.com/Kh4ster) +- [IMP] move core CUDA RT macros to cuda_rt_essentials.hpp ([#1584](https://github.com/rapidsai/raft/pull/1584)) [@MatthiasKohl](https://github.com/MatthiasKohl) +- preprocessor syntax fix ([#1582](https://github.com/rapidsai/raft/pull/1582)) [@AyodeAwe](https://github.com/AyodeAwe) +- use rapids-upload-docs script ([#1578](https://github.com/rapidsai/raft/pull/1578)) [@AyodeAwe](https://github.com/AyodeAwe) +- Unpin `dask` and `distributed` for development and fix `merge_labels` test 
([#1574](https://github.com/rapidsai/raft/pull/1574)) [@galipremsagar](https://github.com/galipremsagar) +- Remove documentation build scripts for Jenkins ([#1570](https://github.com/rapidsai/raft/pull/1570)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add support to __half and nv_bfloat16 to most math functions ([#1554](https://github.com/rapidsai/raft/pull/1554)) [@Kh4ster](https://github.com/Kh4ster) +- Add RAFT ANN benchmark for CAGRA ([#1552](https://github.com/rapidsai/raft/pull/1552)) [@enp1s0](https://github.com/enp1s0) +- Update CAGRA knn_graph_sort to use Raft::bitonic_sort ([#1550](https://github.com/rapidsai/raft/pull/1550)) [@enp1s0](https://github.com/enp1s0) +- Add identity matrix function ([#1548](https://github.com/rapidsai/raft/pull/1548)) [@lowener](https://github.com/lowener) +- Unpin scikit-build upper bound ([#1547](https://github.com/rapidsai/raft/pull/1547)) [@vyasr](https://github.com/vyasr) +- Migrate wheel workflow scripts locally ([#1546](https://github.com/rapidsai/raft/pull/1546)) [@divyegala](https://github.com/divyegala) +- Add sample filtering for ivf_flat. 
Filtering code refactoring and cleanup ([#1541](https://github.com/rapidsai/raft/pull/1541)) [@alexanderguzhva](https://github.com/alexanderguzhva) +- CAGRA pad dataset for 128bit vectorized load ([#1505](https://github.com/rapidsai/raft/pull/1505)) [@tfeher](https://github.com/tfeher) +- Sparse Pairwise Distances API Updates ([#1502](https://github.com/rapidsai/raft/pull/1502)) [@divyegala](https://github.com/divyegala) +- Add CAGRA gbench ([#1496](https://github.com/rapidsai/raft/pull/1496)) [@tfeher](https://github.com/tfeher) +- Cagra index construction without copying device mdarrays ([#1494](https://github.com/rapidsai/raft/pull/1494)) [@tfeher](https://github.com/tfeher) + # raft 23.06.00 (7 Jun 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index 0e8b2d9c89..ed6ae1b529 100755 --- a/README.md +++ b/README.md @@ -318,6 +318,7 @@ The [build](docs/source/build.md) instructions contain more details on building The folder structure mirrors other RAPIDS repos, with the following folders: +- `bench/ann`: Python scripts for running ANN benchmarks - `ci`: Scripts for running CI in PRs - `conda`: Conda recipes and development conda environments - `cpp`: Source code for C++ libraries. 
diff --git a/scripts/ann-benchmarks/algos.yaml b/bench/ann/algos.yaml similarity index 97% rename from scripts/ann-benchmarks/algos.yaml rename to bench/ann/algos.yaml index 54fddf607b..5f554fc46b 100644 --- a/scripts/ann-benchmarks/algos.yaml +++ b/bench/ann/algos.yaml @@ -1,4 +1,4 @@ -faise_gpu_ivf_flat: +faiss_gpu_ivf_flat: executable: FAISS_IVF_FLAT_ANN_BENCH disabled: false faiss_gpu_flat: diff --git a/cpp/bench/ann/conf/bigann-100M.json b/bench/ann/conf/bigann-100M.json similarity index 100% rename from cpp/bench/ann/conf/bigann-100M.json rename to bench/ann/conf/bigann-100M.json diff --git a/cpp/bench/ann/conf/deep-100M.json b/bench/ann/conf/deep-100M.json similarity index 100% rename from cpp/bench/ann/conf/deep-100M.json rename to bench/ann/conf/deep-100M.json diff --git a/cpp/bench/ann/conf/deep-1B.json b/bench/ann/conf/deep-1B.json similarity index 100% rename from cpp/bench/ann/conf/deep-1B.json rename to bench/ann/conf/deep-1B.json diff --git a/cpp/bench/ann/conf/glove-100-inner.json b/bench/ann/conf/glove-100-inner.json similarity index 100% rename from cpp/bench/ann/conf/glove-100-inner.json rename to bench/ann/conf/glove-100-inner.json diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json b/bench/ann/conf/sift-128-euclidean.json similarity index 100% rename from cpp/bench/ann/conf/sift-128-euclidean.json rename to bench/ann/conf/sift-128-euclidean.json diff --git a/scripts/ann-benchmarks/data_export.py b/bench/ann/data_export.py similarity index 66% rename from scripts/ann-benchmarks/data_export.py rename to bench/ann/data_export.py index 5be73bef11..9410cfe773 100644 --- a/scripts/ann-benchmarks/data_export.py +++ b/bench/ann/data_export.py @@ -19,7 +19,7 @@ def export_results(output_filepath, recompute, groundtruth_filepath, - result_filepaths): + result_filepath): print(f"Writing output file to: {output_filepath}") ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"), "cpp/bench/ann/scripts") @@ -27,10 +27,10 @@ def 
export_results(output_filepath, recompute, groundtruth_filepath, "eval.pl") if recompute: p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath, - groundtruth_filepath] + result_filepaths) + groundtruth_filepath, result_filepath]) else: p = subprocess.Popen([ann_bench_scripts_path, "-o", output_filepath, - groundtruth_filepath] + result_filepaths) + groundtruth_filepath, result_filepath]) p.wait() @@ -41,18 +41,23 @@ def main(): required=True) parser.add_argument("--recompute", action="store_true", help="Recompute metrics") - parser.add_argument("--groundtruth", - help="Path to groundtruth.neighbors.ibin file for a dataset", - required=True) - args, result_filepaths = parser.parse_known_args() - - # if nothing is provided - if len(result_filepaths) == 0: - raise ValueError("No filepaths to results were provided") + parser.add_argument("--dataset", + help="Name of the dataset to export results for", + default="glove-100-inner") + parser.add_argument( + "--dataset-path", + help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data") + ) + + args = parser.parse_args() - groundtruth_filepath = args.groundtruth + result_filepath = os.path.join(args.dataset_path, args.dataset, "result") + groundtruth_filepath = os.path.join(args.dataset_path, args.dataset, + "groundtruth.neighbors.ibin") export_results(args.output, args.recompute, groundtruth_filepath, - result_filepaths) + result_filepath) if __name__ == "__main__": diff --git a/scripts/ann-benchmarks/get_dataset.py b/bench/ann/get_dataset.py similarity index 89% rename from scripts/ann-benchmarks/get_dataset.py rename to bench/ann/get_dataset.py index 5c21a5e2e1..a175384dc3 100644 --- a/scripts/ann-benchmarks/get_dataset.py +++ b/bench/ann/get_dataset.py @@ -76,16 +76,17 @@ def download(name, normalize, ann_bench_data_path): def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--name", 
help="dataset to download", + parser.add_argument("--dataset", help="dataset to download", default="glove-100-angular") - parser.add_argument("--path", help="path to download dataset", - default=os.path.join(os.getcwd(), "data")) + parser.add_argument("--dataset-path", help="path to download dataset", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data")) parser.add_argument("--normalize", help="normalize cosine distance to inner product", action="store_true") args = parser.parse_args() - download(args.name, args.normalize, args.path) + download(args.dataset, args.normalize, args.dataset_path) if __name__ == "__main__": diff --git a/scripts/ann-benchmarks/plot.py b/bench/ann/plot.py similarity index 99% rename from scripts/ann-benchmarks/plot.py rename to bench/ann/plot.py index 772bdf8738..0020e398a9 100644 --- a/scripts/ann-benchmarks/plot.py +++ b/bench/ann/plot.py @@ -208,7 +208,7 @@ def load_all_results(result_filepath): def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--result_csv", help="Path to CSV Results", required=True) + parser.add_argument("--result-csv", help="Path to CSV Results", required=True) parser.add_argument("--output", help="Path to the PNG output file", default=f"{os.getcwd()}/out.png") parser.add_argument( diff --git a/scripts/ann-benchmarks/run.py b/bench/ann/run.py similarity index 81% rename from scripts/ann-benchmarks/run.py rename to bench/ann/run.py index e906b26e23..d8e33f1113 100644 --- a/scripts/ann-benchmarks/run.py +++ b/bench/ann/run.py @@ -86,7 +86,17 @@ def main(): parser.add_argument( "--configuration", help="path to configuration file for a dataset", - required=True + ) + parser.add_argument( + "--dataset", + help="dataset whose configuration file will be used", + default="glove-100-inner" + ) + parser.add_argument( + "--dataset-path", + help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", 
"data") ) parser.add_argument( "--build", @@ -121,15 +131,29 @@ def main(): search = args.search # Read configuration file associated to dataset - conf_filepath = args.configuration + if args.configuration: + conf_filepath = args.configuration + else: + conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json") conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) + dataset_name = conf_filename.replace(".json", "") + dataset_path = os.path.join(args.dataset_path, dataset_name) if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) with open(conf_filepath, "r") as f: conf_file = json.load(f) + # Replace base, query to dataset-path + replacement_base_filepath = \ + os.path.normpath(conf_file["dataset"]["base_file"]).split(os.path.sep)[-1] + conf_file["dataset"]["base_file"] = \ + os.path.join(dataset_path, replacement_base_filepath) + replacement_query_filepath = \ + os.path.normpath(conf_file["dataset"]["query_file"]).split(os.path.sep)[-1] + conf_file["dataset"]["query_file"] = \ + os.path.join(dataset_path, replacement_query_filepath) # Ensure base and query files exist for dataset if not os.path.exists(conf_file["dataset"]["base_file"]): raise FileNotFoundError(conf_file["dataset"]["base_file"]) @@ -175,6 +199,14 @@ def main(): executables_to_run[executable_path] = {"index": []} executables_to_run[executable_path]["index"].append(index) + # Replace build, search to dataset path + for executable_path in executables_to_run: + for pos, index in enumerate(executables_to_run[executable_path]["index"]): + index["file"] = os.path.join(dataset_path, "index", index["name"]) + index["search_result_file"] = \ + os.path.join(dataset_path, "result", index["name"]) + executables_to_run[executable_path]["index"][pos] = index + run_build_and_search(conf_filename, conf_file, executables_to_run, args.force, conf_filedir, build, search) diff --git a/scripts/ann-benchmarks/split_groundtruth.py 
b/bench/ann/split_groundtruth.py similarity index 100% rename from scripts/ann-benchmarks/split_groundtruth.py rename to bench/ann/split_groundtruth.py diff --git a/build.sh b/build.sh index 1213500159..8706f1b138 100755 --- a/build.sh +++ b/build.sh @@ -76,7 +76,7 @@ INSTALL_TARGET=install BUILD_REPORT_METRICS="" BUILD_REPORT_INCL_CACHE_STATS=OFF -TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" +TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH" CACHE_ARGS="" @@ -315,10 +315,11 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then # Force compile library when needed test targets are specified if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ $CMAKE_TARGET == *"DISTANCE_TEST"* || \ + $CMAKE_TARGET == *"MATRIX_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \ $CMAKE_TARGET == *"SPARSE_DIST_TEST" || \ $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ - $CMAKE_TARGET == *"MATRIX_TEST"* || \ - $CMAKE_TARGET == *"NEIGHBORS_TEST" || \ $CMAKE_TARGET == *"STATS_TEST"* ]]; then echo "-- Enabling compiled lib for gtests" COMPILE_LIBRARY=ON diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 057e33faf3..7e921decd5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,10 +19,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.5.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* -- dask>=2023.5.1 -- distributed>=2023.5.1 +- dask>=2023.7.1 
+- distributed>=2023.7.1 - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 403afe7413..2ea685b529 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -19,10 +19,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.5.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* -- dask>=2023.5.1 -- distributed>=2023.5.1 +- dask>=2023.7.1 +- distributed>=2023.7.1 - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 278d8c4d5a..d62404b16f 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -34,6 +34,7 @@ dependencies: - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 +- pyyaml - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index b189abb313..c9caa4dd9b 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -60,10 +60,10 @@ requirements: - cudatoolkit {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - dask >=2023.5.1 - - dask-core >=2023.5.1 + - dask >=2023.7.1 + - dask-core >=2023.7.1 - dask-cuda ={{ minor_version }} - - distributed >=2023.5.1 + - distributed >=2023.7.1 - joblib >=0.11 - nccl >=2.9.9 - pylibraft {{ version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 610c5b97f6..ae2660509a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -127,7 +127,11 @@ endif() rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) if(NOT DISABLE_OPENMP) - find_package(OpenMP) + rapids_find_package( + OpenMP REQUIRED + BUILD_EXPORT_SET raft-exports + 
INSTALL_EXPORT_SET raft-exports + ) if(OPENMP_FOUND) message(VERBOSE "RAFT: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") endif() diff --git a/cpp/include/raft/core/device_resources_manager.hpp b/cpp/include/raft/core/device_resources_manager.hpp new file mode 100644 index 0000000000..ee4b151362 --- /dev/null +++ b/cpp/include/raft/core/device_resources_manager.hpp @@ -0,0 +1,606 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace raft { + +/** + * @brief A singleton used to easily generate a raft::device_resources object + * + * Many calls to RAFT functions require a `raft::device_resources` object + * to provide CUDA resources like streams and stream pools. The + * `raft::device_resources_manager` singleton provides a straightforward method to create those + * objects in a way that allows consumers of RAFT to limit total consumption of device resources + * without actively managing streams or other CUDA-specific objects. + * + * To control the resources a consuming application will use, the + * resource manager provides setters for a variety of values. For + * instance, to ensure that no more than `N` CUDA streams are used per + * device, a consumer might call + * `raft::device_resources_manager::set_streams_per_device(N)`. 
Note that all of these + * setters must be used prior to retrieving the first `device_resources` from + * the manager. Setters invoked after this will log a warning but have no + * effect. + * + * After calling all desired setters, consumers can simply call + * `auto res = raft::device_resources_manager::get_device_resources();` to get a valid + * device_resources object for the current device based on previously-set + * parameters. Importantly, calling `get_device_resources()` again from the same + * thread is guaranteed to return a `device_resources` object with the same + * underlying CUDA stream and (if a non-zero number of stream pools has been + * requested) stream pool. + * + * Typical usage might look something like the following: + * @code + * void initialize_application() { + * raft::device_resources_manager::set_streams_per_device(16); + * } + * + * void foo_called_from_multiple_threads() { + * auto res = raft::device_resources_manager::get_device_resources(); + * // Call RAFT function using res + * res.sync_stream() // Ensure work completes before returning + * } + * @endcode + * + * Note that all public methods of the `device_resources_manager` are thread-safe, + * but the manager is designed to minimize locking required for + * retrieving `device_resources` objects. Each thread must acquire a lock + * exactly once per device when calling `get_device_resources`. Subsequent calls + * will still be thread-safe but will not require a lock. + * + * All public methods of the `device_resources_manager` are static. Please see + * documentation of those methods for additional usage information. 
+ * + */ +struct device_resources_manager { + device_resources_manager(device_resources_manager const&) = delete; + void operator=(device_resources_manager const&) = delete; + + private: + device_resources_manager() {} + ~device_resources_manager() + { + // Ensure that we destroy any pool memory resources before CUDA context is + // lost + per_device_components_.clear(); + } + + // Get an id used to identify this thread for the purposes of assigning + // (in round-robin fashion) the same resources to the thread on subsequent calls to + // `get_device_resources` + static auto get_thread_id() + { + static std::atomic thread_counter{}; + thread_local std::size_t id = ++thread_counter; + return id; + } + + // This struct holds the various parameters used to control + // construction of the underlying resources shared by all + // `device_resources` objects returned by `get_device_resources` + struct resource_params { + // The total number of primary streams to be used by the + // application. If no value is provided, the default stream per thread + // is used. + std::optional stream_count{std::nullopt}; + // The total number of stream pools to be used by the application + std::size_t pool_count{}; + // How many streams to assign to each pool + std::size_t pool_size{rmm::cuda_stream_pool::default_size}; + // If a memory pool is requested (max_mem_pool_size is non-zero), use + // this initial size for the pool in bytes. Must be a multiple of 256. + // If nullopt, use half of the available memory on the current + // device. + thrust::optional init_mem_pool_size{thrust::nullopt}; + // If set to any non-zero value, create a memory pool with this + // maximum size. 
If nullopt, use up to the entire available memory of the + // device + thrust::optional max_mem_pool_size{std::size_t{}}; + // Limit on workspace memory for the returned device_resources object + std::optional workspace_allocation_limit{std::nullopt}; + // Optional specification of separate workspace memory resources for each + // device. The integer in each pair indicates the device for this memory + // resource. + std::vector, int>> workspace_mrs{}; + + auto get_workspace_memory_resource(int device_id) {} + } params_; + + // This struct stores the underlying resources to be shared among + // `device_resources` objects returned by this manager. + struct resource_components { + // Construct all underlying resources indicated by `params` for the + // indicated device. This includes primary streams, stream pools, and + // a memory pool if requested. + resource_components(int device_id, resource_params const& params) + : device_id_{device_id}, + streams_{[¶ms, this]() { + auto scoped_device = device_setter{device_id_}; + auto result = std::unique_ptr{nullptr}; + if (params.stream_count) { + result = std::make_unique(*params.stream_count); + } + return result; + }()}, + pools_{[¶ms, this]() { + auto scoped_device = device_setter{device_id_}; + auto result = std::vector>{}; + if (params.pool_size != 0) { + for (auto i = std::size_t{}; i < params.pool_count; ++i) { + result.push_back(std::make_shared(params.pool_size)); + } + } else if (params.pool_count != 0) { + RAFT_LOG_WARN("Stream pools of size 0 requested; no pools will be created"); + } + return result; + }()}, + pool_mr_{[¶ms, this]() { + auto scoped_device = device_setter{device_id_}; + auto result = + std::shared_ptr>{nullptr}; + // If max_mem_pool_size is nullopt or non-zero, create a pool memory + // resource + if (params.max_mem_pool_size.value_or(1) != 0) { + auto* upstream = + dynamic_cast(rmm::mr::get_current_device_resource()); + if (upstream != nullptr) { + result = + std::make_shared>( + upstream, 
params.init_mem_pool_size, params.max_mem_pool_size); + rmm::mr::set_current_device_resource(result.get()); + } else { + RAFT_LOG_WARN( + "Pool allocation requested, but other memory resource has already been set and " + "will not be overwritten"); + } + } + return result; + }()}, + workspace_mr_{[¶ms, this]() { + auto result = std::shared_ptr{nullptr}; + auto iter = std::find_if(std::begin(params.workspace_mrs), + std::end(params.workspace_mrs), + [this](auto&& pair) { return pair.second == device_id_; }); + if (iter != std::end(params.workspace_mrs)) { result = iter->first; } + return result; + }()} + { + } + + // Get the id of the device associated with the constructed resource + // components + [[nodiscard]] auto get_device_id() const { return device_id_; } + // Get the total number of streams available for this application + [[nodiscard]] auto stream_count() const + { + auto result = std::size_t{}; + if (streams_) { result = streams_->get_pool_size(); } + return result; + } + // Get the stream assigned to this host thread. Note that the same stream + // may be used by multiple threads, but any given thread will always use + // the same stream + [[nodiscard]] auto get_stream() const + { + auto result = rmm::cuda_stream_per_thread; + if (stream_count() != 0) { result = streams_->get_stream(get_thread_id() % stream_count()); } + return result; + } + // Get the total number of stream pools available for this + // application + [[nodiscard]] auto pool_count() const { return pools_.size(); } + // Get the stream pool assigned to this host thread. 
Note that the same stream pool + // may be used by multiple threads, but any given thread will always use + // the same stream pool + [[nodiscard]] auto get_pool() const + { + auto result = std::shared_ptr{nullptr}; + if (pool_count() != 0) { result = pools_[get_thread_id() % pool_count()]; } + return result; + } + // Return a (possibly null) shared_ptr to the pool memory resource + // created for this device by the manager + [[nodiscard]] auto get_pool_memory_resource() const { return pool_mr_; } + // Return the RAFT workspace allocation limit that will be used by + // `device_resources` returned from this manager + [[nodiscard]] auto get_workspace_allocation_limit() const + { + return workspace_allocation_limit_; + } + // Return a (possibly null) shared_ptr to the memory resource that will + // be used for workspace allocations by `device_resources` returned from + // this manager + [[nodiscard]] auto get_workspace_memory_resource() { return workspace_mr_; } + + private: + int device_id_; + std::unique_ptr streams_; + std::vector> pools_; + std::shared_ptr> pool_mr_; + std::shared_ptr workspace_mr_; + std::optional workspace_allocation_limit_{std::nullopt}; + }; + + // Mutex used to lock access to shared data until after the first + // `get_device_resources` call in each thread + mutable std::mutex manager_mutex_{}; + // Indicates whether or not `get_device_resources` has been called by any + // host thread + bool params_finalized_{}; + // Container for underlying device resources to be re-used across host + // threads for each device + std::vector per_device_components_; + // Container for device_resources objects shared among threads. The index + // of the outer vector is the thread id of the thread requesting resources + // modulo the total number of resources managed by this object. The inner + // vector contains all resources associated with that id across devices + // in any order. 
+ std::vector> resources_{}; + + // Return a lock for accessing shared data + [[nodiscard]] auto get_lock() const { return std::unique_lock{manager_mutex_}; } + + // Retrieve the underlying resources to be shared across the + // application for the indicated device. This method acquires a lock the + // first time it is called in each thread for a specific device to ensure that the + // underlying resources have been correctly initialized exactly once across + // all host threads. + auto const& get_device_resources_(int device_id) + { + // Each thread maintains an independent list of devices it has + // accessed. If it has not marked a device as initialized, it + // acquires a lock to initialize it exactly once. This means that each + // thread will lock once for a particular device and not proceed until + // some thread has actually generated the corresponding device + // components + thread_local auto initialized_devices = std::vector{}; + auto res_iter = decltype(std::end(resources_[0])){}; + if (std::find(std::begin(initialized_devices), std::end(initialized_devices), device_id) == + std::end(initialized_devices)) { + // Only lock if we have not previously accessed this device on this + // thread + auto lock = get_lock(); + initialized_devices.push_back(device_id); + // If we are building components, do not allow any further changes to + // resource parameters. + params_finalized_ = true; + + if (resources_.empty()) { + // We will potentially need as many device_resources objects as there are combinations of + // streams and pools on a given device. 
+ resources_.resize(std::max(params_.stream_count.value_or(1), std::size_t{1}) * + std::max(params_.pool_count, std::size_t{1})); + } + + auto res_idx = get_thread_id() % resources_.size(); + // Check to see if we have constructed device_resources for the + // requested device at the index assigned to this thread + res_iter = std::find_if(std::begin(resources_[res_idx]), + std::end(resources_[res_idx]), + [device_id](auto&& res) { return res.get_device() == device_id; }); + + if (res_iter == std::end(resources_[res_idx])) { + // Even if we have not yet built device_resources for the current + // device, we may have already built the underlying components, since + // multiple device_resources may point to the same components. + auto component_iter = std::find_if( + std::begin(per_device_components_), + std::end(per_device_components_), + [device_id](auto&& components) { return components.get_device_id() == device_id; }); + if (component_iter == std::end(per_device_components_)) { + // Build components for this device if we have not yet done so on + // another thread + per_device_components_.emplace_back(device_id, params_); + component_iter = std::prev(std::end(per_device_components_)); + } + auto scoped_device = device_setter(device_id); + // Build the device_resources object for this thread out of shared + // components + resources_[res_idx].emplace_back(component_iter->get_stream(), + component_iter->get_pool(), + component_iter->get_workspace_memory_resource(), + component_iter->get_workspace_allocation_limit()); + res_iter = std::prev(std::end(resources_[res_idx])); + } + } else { + auto res_idx = get_thread_id() % resources_.size(); + // If we have previously accessed this device on this thread, we do not + // need to lock. We know that this thread already initialized the + // resources it requires for this device if no other thread had already done so, so we simply + // retrieve the previously-generated resources. 
+ res_iter = std::find_if(std::begin(resources_[res_idx]), + std::end(resources_[res_idx]), + [device_id](auto&& res) { return res.get_device() == device_id; }); + } + return *res_iter; + } + + // Thread-safe setter for the number of streams + void set_streams_per_device_(std::optional num_streams) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + "Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + params_.stream_count = num_streams; + } + } + + // Thread-safe setter for the number and size of stream pools + void set_stream_pools_per_device_(std::size_t num_pools, std::size_t num_streams) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + "Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + params_.pool_count = num_pools; + params_.pool_size = num_streams; + } + } + + // Thread-safe setter for the RAFT workspace allocation limit + void set_workspace_allocation_limit_(std::size_t memory_limit) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + "Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + params_.workspace_allocation_limit.emplace(memory_limit); + } + } + + // Thread-safe setter for the maximum memory pool size + void set_max_mem_pool_size_(std::optional memory_limit) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + "Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + if (memory_limit) { + params_.max_mem_pool_size.emplace(*memory_limit); + } else { + params_.max_mem_pool_size = thrust::nullopt; + } + } + } + + // Thread-safe setter for the initial memory pool size + void set_init_mem_pool_size_(std::optional init_memory) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + 
"Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + if (init_memory) { + params_.init_mem_pool_size.emplace(*init_memory); + } else { + params_.init_mem_pool_size = thrust::nullopt; + } + } + } + + // Thread-safe setter for workspace memory resources + void set_workspace_memory_resource_(std::shared_ptr mr, + int device_id) + { + auto lock = get_lock(); + if (params_finalized_) { + RAFT_LOG_WARN( + "Attempted to set device_resources_manager properties after resources have already been " + "retrieved"); + } else { + auto iter = std::find_if(std::begin(params_.workspace_mrs), + std::end(params_.workspace_mrs), + [device_id](auto&& pair) { return pair.second == device_id; }); + if (iter != std::end(params_.workspace_mrs)) { + iter->first = mr; + } else { + params_.workspace_mrs.emplace_back(mr, device_id); + } + } + } + + // Retrieve the instance of this singleton + static auto& get_manager() + { + static auto manager = device_resources_manager{}; + return manager; + } + + public: + /** + * @brief Retrieve device_resources to be used with the RAFT API + * + * This thread-safe method ensures that a `device_resources` object with + * the same underlying stream and stream pool is returned every time it is + * called by the same host thread. This means that if `get_device_resources` is + * used to provide all `device_resources` in an application, then + * `raft::get_device_resources().sync_stream()` and (if a stream pool is used) + * raft::get_device_resources().sync_stream_pool() are guaranteed to synchronize all + * work previously submitted to the device by this host thread. + * + * If the max memory pool size set with `set_max_mem_pool_size` is non-zero, + * the first call of this method will also create a memory pool to be used + * for all RMM-based allocations on device. + * + * @param device_id int If provided, the device for which resources should + * be returned. Defaults to active CUDA device. 
+ */ + static auto const& get_device_resources(int device_id = device_setter::get_current_device()) + { + return get_manager().get_device_resources_(device_id); + } + + /** + * @brief Set the total number of CUDA streams to be used per device + * + * If nullopt, the default stream per thread will be used + * (essentially allowing as many streams as there are host threads). + * Otherwise, all returned `device_resources` will draw their streams from this + * limited pool. + * + * Limiting the total number of streams can be desirable for a number of + * reasons, but it is most often used in consuming applications to + * prevent a large number of host threads from flooding the device with + * simultaneous requests that may exhaust device memory or other + * resources. + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. + */ + static void set_streams_per_device(std::optional num_streams) + { + get_manager().set_streams_per_device_(num_streams); + } + + /** + * @brief Set the total number and size of CUDA stream pools to be used per device + * + * Setting the number of stream pools to a non-zero value will provide a + * pool of stream pools that can be shared among host threads. This can be + * useful for the same reason it is useful to limit the total number of + * primary streams assigned to `device_resoures` for each host thread. + * Repeated calls to `get_device_resources` on a given host thread are + * guaranteed to return `device_resources` with the same underlying stream + * pool. + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. 
+ */ + static void set_stream_pools_per_device( + std::size_t num_pools, std::size_t num_streams = rmm::cuda_stream_pool::default_size) + { + get_manager().set_stream_pools_per_device_(num_pools, num_streams); + } + /** + * @brief Set the maximum size of temporary RAFT workspaces + * + * Note that this limits only the size of temporary workspace + * allocations. To cap the device memory generally available for all device + * allocations made with RMM, use + * `raft::device_manager::set_max_mem_pool_size` + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. + */ + static void set_workspace_allocation_limit(std::size_t memory_limit) + { + get_manager().set_workspace_allocation_limit_(memory_limit); + } + + /** + * @brief Set the maximum size of the device memory pool + * + * If set to 0, no memory pool will be used. If set to nullopt, the memory + * pool is allowed to grow to the size of available device memory. + * + * Note that the pool will not actually be created until the first call + * to `raft::device_manager::get_device_resources(device_id)`, after which it will become + * the current RMM device memory resource for the indicated device. If the + * current RMM device memory resource has already been set to some + * non-default resource, no pool resource will be created and a warning will be emitted. It is + * assumed that applications which have set a memory resource already wish to manage RMM + * themselves. + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. + */ + static void set_max_mem_pool_size(std::optional max_mem) + { + get_manager().set_max_mem_pool_size_(max_mem); + } + + /** + * @brief Set the initial size of the device memory pool + * + * If set to nullopt, the memory pool starts with half of the available + * device memory. 
+ * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. + */ + static void set_init_mem_pool_size(std::optional init_mem) + { + get_manager().set_init_mem_pool_size_(init_mem); + } + /** + * @brief Request a device memory pool with specified parameters + * + * This convenience method essentially combines + * `set_init_mem_pool_size` and `set_max_mem_pool_size`. It is provided + * primarily to allow users who want a memory pool but do not want to choose + * specific pool sizes to simply call + * `raft::device_manager::set_memory_pool()` and enable a memory pool using + * RMM defaults (initialize with half of available memory, allow to grow + * to all available memory). + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. + */ + static void set_mem_pool(std::optional init_mem = std::nullopt, + std::optional max_mem = std::nullopt) + { + set_init_mem_pool_size(init_mem); + set_max_mem_pool_size(max_mem); + } + + /** + * @brief Set the workspace memory resource to be used on a specific device + * + * RAFT device_resources objects can be built with a separate memory + * resource for allocating temporary workspaces. If a (non-nullptr) memory + * resource is provided by this setter, it will be used as the + * workspace memory resource for all `device_resources` returned for the + * indicated device. + * + * If called after the first call to + * `raft::device_resources_manager::get_device_resources`, no change will be made, + * and a warning will be emitted. 
+ */ + static void set_workspace_memory_resource(std::shared_ptr mr, + int device_id = device_setter::get_current_device()) + { + get_manager().set_workspace_memory_resource_(mr, device_id); + } +}; +} // namespace raft diff --git a/cpp/include/raft/core/device_setter.hpp b/cpp/include/raft/core/device_setter.hpp new file mode 100644 index 0000000000..42049102aa --- /dev/null +++ b/cpp/include/raft/core/device_setter.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +namespace raft { + +/** + * @brief A scoped setter for the active CUDA device + * + * On construction, the device_setter will set the active CUDA device to the + * indicated value. On deletion, the active CUDA device will be set back to + * its previous value. If the call to set the new active device fails, an + * exception will be thrown. If the call to set the device back to its + * previously selected value throws, an error will be logged, but no + * exception will be thrown. 
+ * + * @param int device_id The ID of the CUDA device to make active + * + */ +struct device_setter { + /** + * Return the id of the current device as an integer + */ + static auto get_current_device() + { + auto result = int{}; + RAFT_CUDA_TRY(cudaGetDevice(&result)); + return result; + } + /** + * Return the count of currently available CUDA devices + */ + static auto get_device_count() + { + auto result = int{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&result)); + return result; + } + + explicit device_setter(int new_device) : prev_device_{get_current_device()} + { + RAFT_CUDA_TRY(cudaSetDevice(new_device)); + } + ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_)); } + + private: + int prev_device_; +}; + +} // namespace raft diff --git a/cpp/include/raft/core/logger-ext.hpp b/cpp/include/raft/core/logger-ext.hpp index 8fd29cf1d6..04a6a4d060 100644 --- a/cpp/include/raft/core/logger-ext.hpp +++ b/cpp/include/raft/core/logger-ext.hpp @@ -129,4 +129,23 @@ class logger { static inline std::unordered_map> log_map; }; // class logger +/** + * @brief An object used for scoped log level setting + * + * Instances of `raft::log_level_setter` will set RAFT logging to the level + * indicated on construction and will revert to the previous set level on + * destruction. 
+ */ +struct log_level_setter { + explicit log_level_setter(int level) + { + prev_level_ = logger::get(RAFT_NAME).get_level(); + logger::get(RAFT_NAME).set_level(level); + } + ~log_level_setter() { logger::get(RAFT_NAME).set_level(prev_level_); } + + private: + int prev_level_; +}; // class log_level_setter + }; // namespace raft diff --git a/cpp/include/raft/neighbors/cagra_serialize.cuh b/cpp/include/raft/neighbors/cagra_serialize.cuh index 2242629409..0a806402d2 100644 --- a/cpp/include/raft/neighbors/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/cagra_serialize.cuh @@ -47,12 +47,16 @@ namespace raft::neighbors::cagra { * @param[in] handle the raft handle * @param[in] os output stream * @param[in] index CAGRA index + * @param[in] include_dataset Whether or not to write out the dataset to the file. * */ template -void serialize(raft::resources const& handle, std::ostream& os, const index& index) +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset = true) { - detail::serialize(handle, os, index); + detail::serialize(handle, os, index, include_dataset); } /** @@ -77,14 +81,16 @@ void serialize(raft::resources const& handle, std::ostream& os, const index void serialize(raft::resources const& handle, const std::string& filename, - const index& index) + const index& index, + bool include_dataset = true) { - detail::serialize(handle, filename, index); + detail::serialize(handle, filename, index, include_dataset); } /** @@ -158,4 +164,4 @@ namespace raft::neighbors::experimental::cagra { using raft::neighbors::cagra::deserialize; using raft::neighbors::cagra::serialize; -} // namespace raft::neighbors::experimental::cagra \ No newline at end of file +} // namespace raft::neighbors::experimental::cagra diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh index 8d040c352b..2c9cbd2563 100644 --- 
a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh @@ -24,8 +24,7 @@ namespace raft::neighbors::cagra::detail { -// Serialization version 1. -constexpr int serialization_version = 2; +constexpr int serialization_version = 3; // NB: we wrap this check in a struct, so that the updated RealSize is easy to see in the error // message. @@ -50,41 +49,53 @@ template struct check_index_layout), expecte * */ template -void serialize(raft::resources const& res, std::ostream& os, const index& index_) +void serialize(raft::resources const& res, + std::ostream& os, + const index& index_, + bool include_dataset) { RAFT_LOG_DEBUG( "Saving CAGRA index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); + std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + serialize_scalar(res, os, serialization_version); serialize_scalar(res, os, index_.size()); serialize_scalar(res, os, index_.dim()); serialize_scalar(res, os, index_.graph_degree()); serialize_scalar(res, os, index_.metric()); - auto dataset = index_.dataset(); - // Remove padding before saving the dataset - auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), - sizeof(T) * host_dataset.extent(1), - dataset.data_handle(), - sizeof(T) * dataset.stride(0), - sizeof(T) * host_dataset.extent(1), - dataset.extent(0), - cudaMemcpyDefault, - resource::get_cuda_stream(res))); - resource::sync_stream(res); - serialize_mdspan(res, os, host_dataset.view()); serialize_mdspan(res, os, index_.graph()); + + serialize_scalar(res, os, include_dataset); + if (include_dataset) { + auto dataset = index_.dataset(); + // Remove padding before saving the dataset + auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); + 
RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + dataset.data_handle(), + sizeof(T) * dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + dataset.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + resource::sync_stream(res); + serialize_mdspan(res, os, host_dataset.view()); + } } template void serialize(raft::resources const& res, const std::string& filename, - const index& index_) + const index& index_, + bool include_dataset) { std::ofstream of(filename, std::ios::out | std::ios::binary); if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } - detail::serialize(res, of, index_); + detail::serialize(res, of, index_, include_dataset); of.close(); if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); } @@ -102,6 +113,9 @@ void serialize(raft::resources const& res, template auto deserialize(raft::resources const& res, std::istream& is) -> index { + char dtype_string[4]; + is.read(dtype_string, 4); + auto ver = deserialize_scalar(res, is); if (ver != serialization_version) { RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); @@ -113,9 +127,11 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index auto dataset = raft::make_host_matrix(n_rows, dim); auto graph = raft::make_host_matrix(n_rows, graph_degree); - deserialize_mdspan(res, is, dataset.view()); deserialize_mdspan(res, is, graph.view()); + bool has_dataset = deserialize_scalar(res, is); + if (has_dataset) { deserialize_mdspan(res, is, dataset.view()); } + return index( res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view())); } diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 18f1906dc5..81779668c4 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ 
b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -884,6 +884,7 @@ void launch_kernel(Lambda lambda, queries += grid_dim_y * index.dim(); neighbors += grid_dim_y * grid_dim_x * k; distances += grid_dim_y * grid_dim_x * k; + coarse_index += grid_dim_y * n_probes; } } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh index 90d993abd5..2ab216b13b 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh @@ -43,21 +43,39 @@ static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)), auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool { - if (k > kMaxCapacity) { return false; } // warp_sort not possible - if (n_probes <= 16) { return false; } // too few clusters - if (n_queries * n_probes <= 256) { return false; } // overall amount of work is too small + if (k > kMaxCapacity) { return false; } // warp_sort not possible + if (n_queries * n_probes <= 16) { return false; } // overall amount of work is too small return true; } template struct pq_block_sort { - using type = matrix::detail::select::warpsort:: - block_sort; + using type = matrix::detail::select::warpsort::block_sort< + matrix::detail::select::warpsort::warp_sort_distributed_ext, + Capacity, + true, + T, + IdxT>; + + static auto get_mem_required(uint32_t k_max) + { + if (k_max == 0 || k_max > Capacity) { + return pq_block_sort<0, T, IdxT>::get_mem_required(k_max); + } + if constexpr (Capacity > 1) { + if (k_max * 2 <= Capacity) { + return pq_block_sort<(Capacity / 2), T, IdxT>::get_mem_required(k_max); + } + } + return type::queue_t::mem_required; + } }; template struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t { using type = dummy_block_sort_t; + static auto mem_required(uint32_t) -> size_t { return 0; } + static auto 
get_mem_required(uint32_t) { return mem_required; } }; template @@ -212,7 +230,7 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, * [n_clusters, dim]. * @param pq_centers * The device pointer to the cluster centers in the PQ space - * [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len,]. + * [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len]. * @param pq_dataset * The device pointer to the PQ index (data) [n_rows, ...]. * @param cluster_labels @@ -275,7 +293,9 @@ __global__ void compute_similarity_kernel(uint32_t dim, /* Shared memory: * lut_scores: lookup table (LUT) of size = `pq_dim << PqBits` (when EnableSMemLut) - * base_diff: size = dim (which is equal to `pq_dim * pq_len`) or dim*2 + * lut_end+: + * base_diff: size = dim (which is equal to `pq_dim * pq_len`) or dim*2 + * topk::warp_sort::mem_required - local topk temporary buffer (if necessary) * topk::block_sort: some amount of shared memory, but overlaps with the rest: block_sort only needs shared memory for `.done()` operation, which can come very last. 
*/ @@ -294,13 +314,11 @@ __global__ void compute_similarity_kernel(uint32_t dim, lut_scores += lut_size * blockIdx.x; } - float* base_diff = nullptr; - if constexpr (PrecompBaseDiff) { - if constexpr (EnableSMemLut) { - base_diff = reinterpret_cast(lut_scores + lut_size); - } else { - base_diff = reinterpret_cast(smem_buf); - } + uint8_t* lut_end = nullptr; + if constexpr (EnableSMemLut) { + lut_end = reinterpret_cast(lut_scores + lut_size); + } else { + lut_end = smem_buf; } for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) { @@ -347,15 +365,15 @@ __global__ void compute_similarity_kernel(uint32_t dim, case distance::DistanceType::L2SqrtExpanded: case distance::DistanceType::L2Expanded: { for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { - base_diff[i] = query[i] - cluster_center[i]; + reinterpret_cast(lut_end)[i] = query[i] - cluster_center[i]; } } break; case distance::DistanceType::InnerProduct: { float2 pvals; for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { - pvals.x = query[i]; - pvals.y = cluster_center[i] * pvals.x; - reinterpret_cast(base_diff)[i] = pvals; + pvals.x = query[i]; + pvals.y = cluster_center[i] * pvals.x; + reinterpret_cast(lut_end)[i] = pvals; } } break; default: __builtin_unreachable(); @@ -382,7 +400,7 @@ __global__ void compute_similarity_kernel(uint32_t dim, case distance::DistanceType::L2Expanded: { float diff; if constexpr (PrecompBaseDiff) { - diff = base_diff[j]; + diff = reinterpret_cast(lut_end)[j]; } else { diff = query[j] - cluster_center[j]; } @@ -393,7 +411,7 @@ __global__ void compute_similarity_kernel(uint32_t dim, // NB: we negate the scores as we hardcoded select-topk to always compute the minimum float q; if constexpr (PrecompBaseDiff) { - float2 pvals = reinterpret_cast(base_diff)[j]; + float2 pvals = reinterpret_cast(lut_end)[j]; q = pvals.x; score -= pvals.y; } else { @@ -438,7 +456,6 @@ __global__ void compute_similarity_kernel(uint32_t dim, constexpr OutT kDummy = upper_bound(); 
OutT query_kth = kDummy; if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } - local_topk_t block_topk(topk, nullptr, query_kth); OutT early_stop_limit = kDummy; switch (metric) { // If the metric is non-negative, we can use the query_kth approximation as an early stop @@ -453,6 +470,7 @@ __global__ void compute_similarity_kernel(uint32_t dim, // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score __threadfence_block(); __syncthreads(); + local_topk_t block_topk(topk, lut_end, query_kth); // Compute a distance for each sample for (uint32_t i = threadIdx.x; i < n_samples_aligned; @@ -680,13 +698,31 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // Shared memory for storing pre-computed pieces to speedup the lookup table construction // (e.g. the distance between a cluster center and the query for L2). size_t bdf_mem = sizeof(float) * precomp_data_count; - // Shared memory for the fused top-k component; it may overlap with the other uses of shared - // memory and depends on the number of threads. - struct ltk_mem_t { + + // Shared memory used by the fused top-k during cluster scanning; + // may overlap with the precomputed distance array + struct ltk_add_mem_t { + size_t (*mem_required)(uint32_t); + + ltk_add_mem_t(bool manage_local_topk, uint32_t topk) + : mem_required(pq_block_sort::get_mem_required( + manage_local_topk ? 
topk : 0)) + { + } + + [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t + { + return mem_required(n_threads); + } + } ltk_add_mem{manage_local_topk, topk}; + + // Shared memory for the fused top-k component; + // may overlap with all other uses of shared memory + struct ltk_reduce_mem_t { uint32_t subwarp_size; uint32_t topk; bool manage_local_topk; - ltk_mem_t(bool manage_local_topk, uint32_t topk) + ltk_reduce_mem_t(bool manage_local_topk, uint32_t topk) : manage_local_topk(manage_local_topk), topk(topk) { subwarp_size = WarpSize; @@ -703,7 +739,19 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, n_threads / subwarp_size, topk) : 0; } - } ltk_mem{manage_local_topk, topk}; + } ltk_reduce_mem{manage_local_topk, topk}; + + struct total_shared_mem_t { + ltk_add_mem_t& ltk_add_mem; + ltk_reduce_mem_t& ltk_reduce_mem; + size_t lut_mem; + size_t bdf_mem; + [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t + { + return std::max(ltk_reduce_mem(n_threads), + lut_mem + std::max(bdf_mem, ltk_add_mem(n_threads))); + } + }; // Total amount of work; should be enough to occupy the GPU. uint32_t n_blocks = n_queries * n_probes; @@ -749,17 +797,24 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, auto conf_no_basediff = get_compute_similarity_kernel; auto conf_no_smem_lut = get_compute_similarity_kernel; auto topk_or_zero = manage_local_topk ? 
topk : 0u; - std::array candidates{std::make_tuple(conf_fast(pq_bits, topk_or_zero), lut_mem + bdf_mem, true), - std::make_tuple(conf_no_basediff(pq_bits, topk_or_zero), lut_mem, true), - std::make_tuple(conf_no_smem_lut(pq_bits, topk_or_zero), bdf_mem, false)}; + std::array candidates{ + std::make_tuple(conf_fast(pq_bits, topk_or_zero), + total_shared_mem_t{ltk_add_mem, ltk_reduce_mem, lut_mem, bdf_mem}, + true), + std::make_tuple(conf_no_basediff(pq_bits, topk_or_zero), + total_shared_mem_t{ltk_add_mem, ltk_reduce_mem, lut_mem, 0}, + true), + std::make_tuple(conf_no_smem_lut(pq_bits, topk_or_zero), + total_shared_mem_t{ltk_add_mem, ltk_reduce_mem, 0, bdf_mem}, + false)}; // we may allow slightly lower than 100% occupancy; constexpr double kTargetOccupancy = 0.75; // This struct is used to select the better candidate occupancy_t selected_perf{}; selected selected_config; - for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) { - if (smem_size_const > dev_props.sharedMemPerBlockOptin) { + for (auto [kernel, smem_size_f, lut_is_in_shmem] : candidates) { + if (smem_size_f(WarpSize) > dev_props.sharedMemPerBlockOptin) { // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate. continue; } @@ -770,7 +825,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // launch configuration, we will tighten the carveout once more, based on the final memory // usage and occupancy. 
const int max_carveout = - estimate_carveout(preferred_shmem_carveout, smem_size_const, dev_props); + estimate_carveout(preferred_shmem_carveout, smem_size_f(WarpSize), dev_props); RAFT_CUDA_TRY( cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, max_carveout)); @@ -780,7 +835,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t n_threads = round_down_safe(kernel_attrs.maxThreadsPerBlock, n_threads_gty); // Actual required shmem depens on the number of threads - size_t smem_size = max(smem_size_const, ltk_mem(n_threads)); + size_t smem_size = smem_size_f(n_threads); // Make sure the kernel can get enough shmem. cudaError_t cuda_status = @@ -807,7 +862,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, } if (n_threads_tmp < n_threads) { while (n_threads_tmp >= n_threads_min) { - auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp)); + auto smem_size_tmp = smem_size_f(n_threads_tmp); occupancy_t tmp( smem_size_tmp, n_threads_tmp, kernel, dev_props); bool select_it = false; diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index 0740e2ab8c..e8bf9c6de5 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -154,21 +154,21 @@ inline void cusparsecoosortByRow( // NOLINT * @defgroup cusparse Create CSR operations * @{ */ -template +template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, int64_t nnz, - IndexT* csrRowOffsets, - IndexT* csrColInd, + IndptrType* csrRowOffsets, + IndicesType* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, int64_t nnz, - int* csrRowOffsets, - int* csrColInd, + int32_t* csrRowOffsets, + int32_t* csrColInd, float* csrValues) { return cusparseCreateCsr(spMatDescr, @@ -188,8 +188,8 @@ 
inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, int64_t nnz, - int* csrRowOffsets, - int* csrColInd, + int32_t* csrRowOffsets, + int32_t* csrColInd, double* csrValues) { return cusparseCreateCsr(spMatDescr, @@ -1058,9 +1058,9 @@ inline cusparseStatus_t cusparsecsr2dense_buffersize(cusparseHandle_t handle, cusparseSpMatDescr_t matA; cusparsecreatecsr(&matA, - m, - n, - nnz, + static_cast(m), + static_cast(n), + static_cast(nnz), const_cast(csrRowPtrA), const_cast(csrColIndA), const_cast(csrValA)); @@ -1107,9 +1107,9 @@ inline cusparseStatus_t cusparsecsr2dense_buffersize(cusparseHandle_t handle, cusparseOrder_t order = row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; cusparseSpMatDescr_t matA; cusparsecreatecsr(&matA, - m, - n, - nnz, + static_cast(m), + static_cast(n), + static_cast(nnz), const_cast(csrRowPtrA), const_cast(csrColIndA), const_cast(csrValA)); @@ -1173,9 +1173,9 @@ inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, cusparseOrder_t order = row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; cusparseSpMatDescr_t matA; cusparsecreatecsr(&matA, - m, - n, - nnz, + static_cast(m), + static_cast(n), + static_cast(nnz), const_cast(csrRowPtrA), const_cast(csrColIndA), const_cast(csrValA)); @@ -1220,9 +1220,9 @@ inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, cusparseOrder_t order = row_major ? 
CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; cusparseSpMatDescr_t matA; cusparsecreatecsr(&matA, - m, - n, - nnz, + static_cast(m), + static_cast(n), + static_cast(nnz), const_cast(csrRowPtrA), const_cast(csrColIndA), const_cast(csrValA)); diff --git a/cpp/include/raft/sparse/linalg/detail/spmm.hpp b/cpp/include/raft/sparse/linalg/detail/spmm.hpp index 4ad8623076..d8d73ee83f 100644 --- a/cpp/include/raft/sparse/linalg/detail/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/detail/spmm.hpp @@ -77,23 +77,25 @@ cusparseDnMatDescr_t create_descriptor( /** * @brief create a cuSparse sparse descriptor * @tparam ValueType Data type of sparse_view (float/double) + * @tparam IndptrType Data type of csr_matrix_view index pointers + * @tparam IndicesType Data type of csr_matrix_view indices * @tparam NZType Type of sparse_view * @param[in] sparse_view input raft::device_csr_matrix_view of size M rows x K columns * @returns sparse matrix descriptor to be used by cuSparse API */ -template +template cusparseSpMatDescr_t create_descriptor( - raft::device_csr_matrix_view& sparse_view) + raft::device_csr_matrix_view& sparse_view) { cusparseSpMatDescr_t descr; auto csr_structure = sparse_view.structure_view(); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr( &descr, - csr_structure.get_n_rows(), - csr_structure.get_n_cols(), - csr_structure.get_nnz(), - const_cast(csr_structure.get_indptr().data()), - const_cast(csr_structure.get_indices().data()), + static_cast(csr_structure.get_n_rows()), + static_cast(csr_structure.get_n_cols()), + static_cast(csr_structure.get_nnz()), + const_cast(csr_structure.get_indptr().data()), + const_cast(csr_structure.get_indices().data()), const_cast*>(sparse_view.get_elements().data()))); return descr; } diff --git a/cpp/include/raft/util/cache_util.cuh b/cpp/include/raft/util/cache_util.cuh index bbd84d8bf2..a7dcc22b02 100644 --- a/cpp/include/raft/util/cache_util.cuh +++ b/cpp/include/raft/util/cache_util.cuh @@ -328,15 +328,16 @@ __global__ void 
assign_cache_idx(const int* keys, * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ inline void get_cache_idx(int* keys, - int n, - int* cached_keys, - int n_cache_sets, - int associativity, - int* cache_time, - int* cache_idx, - bool* is_cached, - int time) +template +__global__ void get_cache_idx(int* keys, + int n, + int* cached_keys, + int n_cache_sets, + int associativity, + int* cache_time, + int* cache_idx, + bool* is_cached, + int time) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh index 74c57b4ca2..362396f9b8 100644 --- a/cpp/include/raft/util/reduction.cuh +++ b/cpp/include/raft/util/reduction.cuh @@ -108,7 +108,6 @@ DI T blockReduce(T val, char* smem, ReduceLambda reduce_op = raft::add_op{}) * @param val input value * @param idx index to be used as rank * @param reduce_op a binary reduction operation. 
- * @return only the thread0 will contain valid reduced result */ template DI void warpRankedReduce(T& val, i_t& idx, ReduceLambda reduce_op = raft::min_op{}) @@ -199,4 +198,4 @@ DI i_t binaryBlockReduce(i_t val, i_t* shmem) } } -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/include/raft_runtime/neighbors/cagra.hpp b/cpp/include/raft_runtime/neighbors/cagra.hpp index 6f56302776..c54ed32b77 100644 --- a/cpp/include/raft_runtime/neighbors/cagra.hpp +++ b/cpp/include/raft_runtime/neighbors/cagra.hpp @@ -56,14 +56,16 @@ namespace raft::runtime::neighbors::cagra { raft::device_matrix_view distances); \ void serialize_file(raft::resources const& handle, \ const std::string& filename, \ - const raft::neighbors::cagra::index& index); \ + const raft::neighbors::cagra::index& index, \ + bool include_dataset = true); \ \ void deserialize_file(raft::resources const& handle, \ const std::string& filename, \ raft::neighbors::cagra::index* index); \ void serialize(raft::resources const& handle, \ std::string& str, \ - const raft::neighbors::cagra::index& index); \ + const raft::neighbors::cagra::index& index, \ + bool include_dataset = true); \ \ void deserialize(raft::resources const& handle, \ const std::string& str, \ diff --git a/cpp/src/raft_runtime/neighbors/cagra_serialize.cu b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu index be9788562a..69b48b93a4 100644 --- a/cpp/src/raft_runtime/neighbors/cagra_serialize.cu +++ b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu @@ -27,9 +27,10 @@ namespace raft::runtime::neighbors::cagra { #define RAFT_INST_CAGRA_SERIALIZE(DTYPE) \ void serialize_file(raft::resources const& handle, \ const std::string& filename, \ - const raft::neighbors::cagra::index& index) \ + const raft::neighbors::cagra::index& index, \ + bool include_dataset) \ { \ - raft::neighbors::cagra::serialize(handle, filename, index); \ + raft::neighbors::cagra::serialize(handle, filename, index, include_dataset); \ }; \ \ 
void deserialize_file(raft::resources const& handle, \ @@ -41,10 +42,11 @@ namespace raft::runtime::neighbors::cagra { }; \ void serialize(raft::resources const& handle, \ std::string& str, \ - const raft::neighbors::cagra::index& index) \ + const raft::neighbors::cagra::index& index, \ + bool include_dataset) \ { \ std::stringstream os; \ - raft::neighbors::cagra::serialize(handle, os, index); \ + raft::neighbors::cagra::serialize(handle, os, index, include_dataset); \ str = os.str(); \ } \ \ diff --git a/cpp/template/CMakeLists.txt b/cpp/template/CMakeLists.txt index 34eb4128b5..44b06e1b5f 100644 --- a/cpp/template/CMakeLists.txt +++ b/cpp/template/CMakeLists.txt @@ -34,5 +34,5 @@ rapids_cpm_init() include(cmake/thirdparty/get_raft.cmake) # -------------- compile tasks ----------------- # -add_executable(TEST_RAFT src/test_distance.cu) +add_executable(TEST_RAFT src/test_vector_search.cu) target_link_libraries(TEST_RAFT PRIVATE raft::raft raft::compiled) diff --git a/cpp/template/src/test_distance.cu b/cpp/template/src/test_distance.cu deleted file mode 100644 index e165cd8f14..0000000000 --- a/cpp/template/src/test_distance.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -int main() -{ - raft::device_resources handle; - - int n_samples = 5000; - int n_features = 50; - - auto input = raft::make_device_matrix(handle, n_samples, n_features); - auto labels = raft::make_device_vector(handle, n_samples); - auto output = raft::make_device_matrix(handle, n_samples, n_samples); - - raft::random::make_blobs(handle, input.view(), labels.view()); - - auto metric = raft::distance::DistanceType::L2SqrtExpanded; - raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); -} diff --git a/cpp/template/src/test_vector_search.cu b/cpp/template/src/test_vector_search.cu new file mode 100644 index 0000000000..f54cfc03e7 --- /dev/null +++ b/cpp/template/src/test_vector_search.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +int main() +{ + using namespace raft::neighbors; + raft::device_resources dev_resources; + // Use 5 GB of pool memory + raft::resource::set_workspace_to_pool_resource( + dev_resources, std::make_optional(5 * 1024 * 1024 * 1024ull)); + + int64_t n_samples = 50000; + int64_t n_dim = 90; + int64_t topk = 12; + int64_t n_queries = 1; + + // create input and output arrays + auto input = raft::make_device_matrix(dev_resources, n_samples, n_dim); + auto labels = raft::make_device_vector(dev_resources, n_samples); + auto queries = raft::make_device_matrix(dev_resources, n_queries, n_dim); + auto neighbors = raft::make_device_matrix(dev_resources, n_queries, topk); + auto distances = raft::make_device_matrix(dev_resources, n_queries, topk); + + raft::random::make_blobs(dev_resources, input.view(), labels.view()); + + // use default index parameters + cagra::index_params index_params; + // create and fill the index from a [n_samples, n_dim] input + auto index = cagra::build( + dev_resources, index_params, raft::make_const_mdspan(input.view())); + // use default search parameters + cagra::search_params search_params; + // search K nearest neighbors + cagra::search(dev_resources, + search_params, + index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view()); +} diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index efcd48cd1d..deefd9644a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -105,6 +105,8 @@ if(BUILD_TESTS) NAME CORE_TEST PATH + test/core/device_resources_manager.cpp + test/core/device_setter.cpp test/core/logger.cpp test/core/math_device.cu test/core/math_host.cpp diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp new file mode 100644 index 0000000000..11d07e3c7b --- /dev/null +++ b/cpp/test/core/device_resources_manager.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +auto get_test_device_ids() +{ + auto devices = std::array{int{}, int{}}; + auto device_count = 0; + RAFT_CUDA_TRY(cudaGetDeviceCount(&device_count)); + devices[1] = int{device_count > 1}; + return devices; +} + +TEST(DeviceResourcesManager, ObeysSetters) +{ + auto devices = get_test_device_ids(); + + auto streams_per_device = 3; + auto pools_per_device = 3; + auto streams_per_pool = 7; + auto workspace_limit = 2048; + auto workspace_init = 1024; + device_resources_manager::set_streams_per_device(streams_per_device); + device_resources_manager::set_stream_pools_per_device(pools_per_device, streams_per_pool); + device_resources_manager::set_mem_pool(); + device_resources_manager::set_workspace_allocation_limit(workspace_limit); + + auto unique_streams = std::array, 2>{}; + auto unique_pools = std::array, 2>{}; + + // Provide lock for counting unique objects + auto mtx = std::mutex{}; + auto workspace_mrs = + std::array>, 2>{ + nullptr, nullptr}; + auto alternate_workspace_mrs = std::array, 2>{}; + auto upstream_mrs = std::array{ + dynamic_cast( + rmm::mr::get_per_device_resource(rmm::cuda_device_id{devices[0]})), + dynamic_cast( + rmm::mr::get_per_device_resource(rmm::cuda_device_id{devices[1]}))}; + + for (auto i = std::size_t{}; i < devices.size(); ++i) { + 
auto scoped_device = device_setter{devices[i]}; + if (upstream_mrs[i] == nullptr) { + RAFT_LOG_WARN( + "RMM memory resource already set. Tests for device_resources_manger will be incomplete."); + } else { + workspace_mrs[i] = + std::make_shared>( + upstream_mrs[i], workspace_init, workspace_limit); + alternate_workspace_mrs[i] = std::make_shared(); + } + } + + device_resources_manager::set_workspace_memory_resource(workspace_mrs[0], devices[0]); + device_resources_manager::set_workspace_memory_resource(workspace_mrs[1], devices[1]); + + // Suppress the many warnings from testing use of setters after initial + // get_device_resources call + auto scoped_log_level = log_level_setter{RAFT_LEVEL_ERROR}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(5) + for (auto i = std::size_t{}; i < 101; ++i) { + thread_local auto prev_streams = std::array, 2>{}; + auto device = devices[i % devices.size()]; + auto const& res = device_resources_manager::get_device_resources(device); + + auto primary_stream = res.get_stream().value(); + prev_streams[device] = prev_streams[device].value_or(primary_stream); + // Expect to receive the same stream every time for a given thread + EXPECT_EQ(*prev_streams[device], primary_stream); + + // Using RAII device setter here to avoid changing device in other tests + // that depend on a specific device to be set + auto scoped_device = device_setter{device}; + auto const& res2 = device_resources_manager::get_device_resources(); + // Expect device_resources to default to current device + EXPECT_EQ(primary_stream, res2.get_stream().value()); + + auto const& pool = res.get_stream_pool(); + EXPECT_EQ(streams_per_pool, pool.get_pool_size()); + + auto* mr = dynamic_cast*>( + rmm::mr::get_current_device_resource()); + auto* workspace_mr = + dynamic_cast*>( + dynamic_cast*>( + res.get_workspace_resource()) + ->get_upstream()); + if (upstream_mrs[i % devices.size()] != nullptr) { + // Expect that the current memory resource is a pool memory 
resource as requested + EXPECT_NE(mr, nullptr); + // Expect that the upstream workspace memory resource is a pool memory + // resource as requested + EXPECT_NE(workspace_mr, nullptr); + } + + { + auto lock = std::unique_lock{mtx}; + unique_streams[device].insert(primary_stream); + unique_pools[device].insert(&pool); + } + // Ensure that setters have no effect after get_device_resources call + device_resources_manager::set_streams_per_device(streams_per_device + 1); + device_resources_manager::set_stream_pools_per_device(pools_per_device - 1); + device_resources_manager::set_mem_pool(); + device_resources_manager::set_workspace_allocation_limit(1024); + device_resources_manager::set_workspace_memory_resource( + alternate_workspace_mrs[i % devices.size()], devices[i % devices.size()]); + } + + EXPECT_EQ(streams_per_device, unique_streams[devices[0]].size()); + EXPECT_EQ(streams_per_device, unique_streams[devices[1]].size()); + EXPECT_EQ(pools_per_device, unique_pools[devices[0]].size()); + EXPECT_EQ(pools_per_device, unique_pools[devices[1]].size()); +} + +} // namespace raft diff --git a/cpp/test/core/device_setter.cpp b/cpp/test/core/device_setter.cpp new file mode 100644 index 0000000000..5a4ff01346 --- /dev/null +++ b/cpp/test/core/device_setter.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +namespace raft { +TEST(DeviceSetter, ScopedDevice) +{ + auto device_a = int{}; + auto device_b = int{device_setter::get_device_count() > 1}; + if (device_b == device_a) { + RAFT_LOG_WARN("Only 1 CUDA device detected. device_setter test will be trivial"); + } + auto initial_device = 0; + RAFT_CUDA_TRY(cudaGetDevice(&initial_device)); + auto current_device = initial_device; + { + auto scoped_device = device_setter{device_a}; + // Confirm that device is currently device_a + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + EXPECT_EQ(current_device, device_a); + // Confirm that get_current_device reports expected device + EXPECT_EQ(current_device, device_setter::get_current_device()); + } + + // Confirm that device went back to initial value once setter was out of + // scope + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + EXPECT_EQ(current_device, initial_device); + + { + auto scoped_device = device_setter{device_b}; + // Confirm that device is currently device_b + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + EXPECT_EQ(current_device, device_b); + // Confirm that get_current_device reports expected device + EXPECT_EQ(current_device, device_setter::get_current_device()); + } + + // Confirm that device went back to initial value once setter was out of + // scope + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + EXPECT_EQ(current_device, initial_device); + + { + auto scoped_device1 = device_setter{device_b}; + auto scoped_device2 = device_setter{device_a}; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + // Confirm that multiple setters behave as expected, with the last + // constructed taking precedence + EXPECT_EQ(current_device, device_a); + } +} +} // namespace raft diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 89cb070afc..ea905d2089 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -137,6 +137,7 @@ struct AnnCagraInputs { int search_width; 
raft::distance::DistanceType metric; bool host_dataset; + bool include_serialized_dataset; // std::optional double min_recall; // = std::nullopt; }; @@ -217,9 +218,11 @@ class AnnCagraTest : public ::testing::TestWithParam { } else { index = cagra::build(handle_, index_params, database_view); }; - cagra::serialize(handle_, "cagra_index", index); + cagra::serialize(handle_, "cagra_index", index, ps.include_serialized_dataset); } + auto index = cagra::deserialize(handle_, "cagra_index"); + if (!ps.include_serialized_dataset) { index.update_dataset(handle_, database_view); } auto search_queries_view = raft::make_device_matrix_view( search_queries.data(), ps.n_queries, ps.dim); @@ -340,9 +343,7 @@ class AnnCagraSortTest : public ::testing::TestWithParam { void SetUp() override { - std::cout << "Resizing database: " << ps.n_rows * ps.dim << std::endl; database.resize(((size_t)ps.n_rows) * ps.dim, handle_.get_stream()); - std::cout << "Done.\nRuning rng" << std::endl; raft::random::Rng r(1234ULL); if constexpr (std::is_same{}) { GenerateRoundingErrorFreeDataset(database.data(), ps.n_rows, ps.dim, r, handle_.get_stream()); @@ -379,6 +380,7 @@ inline std::vector generate_inputs() {1}, {raft::distance::DistanceType::L2Expanded}, {false}, + {true}, {0.995}); auto inputs2 = raft::util::itertools::product( @@ -393,6 +395,7 @@ inline std::vector generate_inputs() {1}, {raft::distance::DistanceType::L2Expanded}, {false}, + {true}, {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); inputs2 = @@ -407,6 +410,7 @@ inline std::vector generate_inputs() {1}, {raft::distance::DistanceType::L2Expanded}, {false}, + {false}, {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); @@ -422,6 +426,7 @@ inline std::vector generate_inputs() {1}, {raft::distance::DistanceType::L2Expanded}, {false}, + {true}, {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); @@ -437,6 +442,7 @@ inline std::vector generate_inputs() {1}, 
{raft::distance::DistanceType::L2Expanded}, {false, true}, + {false}, {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); @@ -452,6 +458,7 @@ inline std::vector generate_inputs() {1}, {raft::distance::DistanceType::L2Expanded}, {false, true}, + {true}, {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index d72d73680a..71d48cdeb7 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -497,6 +497,11 @@ const std::vector> inputs = { raft::matrix::detail::select::warpsort::kMaxCapacity * 4, raft::matrix::detail::select::warpsort::kMaxCapacity * 4, raft::distance::DistanceType::InnerProduct, - false}}; + false}, + + // The following two test cases should show very similar recall. + // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric, adaptive_centers + {20000, 8712, 3, 10, 51, 66, raft::distance::DistanceType::L2Expanded, false}, + {100000, 8712, 3, 10, 51, 66, raft::distance::DistanceType::L2Expanded, false}}; } // namespace raft::neighbors::ivf_flat diff --git a/dependencies.yaml b/dependencies.yaml index 4b825b48f0..cf8170b9a1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,6 +170,7 @@ dependencies: - libfaiss>=1.7.1 - faiss-proc=*=cuda - matplotlib + - pyyaml cudatoolkit: specific: @@ -304,16 +305,16 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - dask>=2023.5.1 + - dask>=2023.7.1 - dask-cuda==23.10.* - - distributed>=2023.5.1 + - distributed>=2023.7.1 - joblib>=0.11 - numba>=0.57 - *numpy - ucx-py==0.34.* - output_types: conda packages: - - dask-core>=2023.5.1 + - dask-core>=2023.7.1 - ucx>=1.13.0 - ucx-proc=*=gpu - output_types: pyproject diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst index e3d402d6af..85c454b355 100644 --- a/docs/source/cpp_api/core_resources.rst +++ b/docs/source/cpp_api/core_resources.rst 
@@ -35,6 +35,25 @@ namespace *raft::core* :project: RAFT :members: +Device Resources Manager +------------------------ + +While `raft::device_resources` provides a convenient way to access +device-related resources for a sequence of RAFT calls, it is sometimes useful +to be able to limit those resources across an entire application. For +instance, in highly multi-threaded applications, it can be helpful to limit +the total number of streams rather than relying on the default stream per +thread. `raft::device_resources_manager` offers a way to access +`raft::device_resources` instances that draw from a limited pool of +underlying device resources. + +``#include `` + +namespace *raft::core* + +.. doxygenclass:: raft::device_resources_manager + :project: RAFT + :members: Resource Functions ------------------ diff --git a/docs/source/cpp_api/neighbors_cagra.rst b/docs/source/cpp_api/neighbors_cagra.rst index 6613b0b06d..99ecd3a985 100644 --- a/docs/source/cpp_api/neighbors_cagra.rst +++ b/docs/source/cpp_api/neighbors_cagra.rst @@ -19,3 +19,13 @@ namespace *raft::neighbors::cagra* :content-only: +Serializer Methods +------------------ +``#include `` + +namespace *raft::neighbors::cagra* + +.. doxygengroup:: cagra_serialize + :project: RAFT + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_ivf_pq.rst b/docs/source/cpp_api/neighbors_ivf_pq.rst index 348928d719..17948a37fe 100644 --- a/docs/source/cpp_api/neighbors_ivf_pq.rst +++ b/docs/source/cpp_api/neighbors_ivf_pq.rst @@ -21,6 +21,17 @@ Serializer Methods namespace *raft::neighbors::ivf_pq* .. doxygengroup:: ivf_pq_serialize + :project: RAFT + :members: + :content-only: + +Candidate Refinement +-------------------- +``#include `` + +namespace *raft::neighbors* + +.. 
doxygengroup:: ann_refine :project: RAFT :members: :content-only: \ No newline at end of file diff --git a/docs/source/pylibraft_api/cluster.rst b/docs/source/pylibraft_api/cluster.rst index 59e53e7d4c..085297fe34 100644 --- a/docs/source/pylibraft_api/cluster.rst +++ b/docs/source/pylibraft_api/cluster.rst @@ -7,6 +7,9 @@ This page provides pylibraft class references for the publicly-exposed elements :language: python :class: highlight +KMeans +###### + .. autoclass:: pylibraft.cluster.kmeans.KMeansParams :members: @@ -14,8 +17,4 @@ This page provides pylibraft class references for the publicly-exposed elements .. autofunction:: pylibraft.cluster.kmeans.cluster_cost -.. autofunction:: pylibraft.cluster.compute_new_centroids - - - - +.. autofunction:: pylibraft.cluster.kmeans.compute_new_centroids diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst index ca89c25ed4..680a2982cb 100644 --- a/docs/source/pylibraft_api/neighbors.rst +++ b/docs/source/pylibraft_api/neighbors.rst @@ -27,6 +27,11 @@ CAGRA .. autofunction:: pylibraft.neighbors.cagra.search +Serializer Methods +------------------ +.. autofunction:: pylibraft.neighbors.cagra.save + +.. autofunction:: pylibraft.neighbors.cagra.load IVF-Flat ######## @@ -43,6 +48,12 @@ IVF-Flat .. autofunction:: pylibraft.neighbors.ivf_flat.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_flat.save + +.. autofunction:: pylibraft.neighbors.ivf_flat.load IVF-PQ ###### @@ -59,8 +70,14 @@ IVF-PQ .. autofunction:: pylibraft.neighbors.ivf_pq.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_pq.save + +.. autofunction:: pylibraft.neighbors.ivf_pq.load Candidate Refinement -#################### +-------------------- .. 
autofunction:: pylibraft.neighbors.refine diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 3a3db0f3ea..2985a8120b 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -6,10 +6,13 @@ This project provides a benchmark program for various ANN search implementations The easiest way to install these benchmarks is through conda. We suggest using mamba as it generally leads to a faster install time:: ```bash +git clone https://github.com/rapidsai/raft.git && cd raft +export RAFT_HOME=$(pwd) + mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml conda activate raft_ann_benchmarks -mamba install -c rapidsai libraft-ann-bench +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-ann-bench cudatoolkit=11.8* ``` The channel `rapidsai` can easily be substituted `rapidsai-nightly` if nightly benchmarks are desired. @@ -32,50 +35,50 @@ expected to be defined to run these scripts; this variable holds the directory w ### End-to-end example: Million-scale ```bash export RAFT_HOME=$(pwd) -# All scripts are present in directory raft/scripts/ann-benchmarks +# All scripts are present in directory raft/bench/ann # (1) prepare dataset -python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize +python bench/ann/get_dataset.py --dataset glove-100-angular --normalize # (2) build and search index -python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json +python bench/ann/run.py --dataset glove-100-inner # (3) evaluate results -python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/ +python bench/ann/data_export.py --output out.csv --dataset glove-100-inner # (4) plot results -python scripts/ann-benchmarks/plot.py --result_csv out.csv +python bench/ann/plot.py --result-csv out.csv ``` ### End-to-end example: 
Billion-scale -`scripts/get_dataset.py` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale) +`bench/ann/get_dataset.py` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale) because they are so large. You should instead use our billion-scale datasets guide to download and prepare them. -All other python scripts mentioned below work as intended once the +All other python mentioned below work as intended once the billion-scale dataset has been downloaded. To download Billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html) ```bash export RAFT_HOME=$(pwd) -# All scripts are present in directory raft/scripts/ann-benchmarks +# All scripts are present in directory raft/bench/ann mkdir -p data/deep-1B # (1) prepare dataset # download manually "Ground Truth" file of "Yandex DEEP" # suppose the file name is deep_new_groundtruth.public.10K.bin -python scripts/ann-benchmarks/split_groundtruth.py data/deep-1B/deep_new_groundtruth.public.10K.bin +python bench/ann/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced # (2) build and search index -python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json +python bench/ann/run.py --dataset deep-1B # (3) evaluate results -python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/ +python bench/ann/data_export.py --output out.csv --dataset deep-1B # (4) plot results -python scripts/ann-benchmarks/plot.py --result_csv out.csv +python bench/ann/plot.py --result-csv out.csv ``` -The usage of `scripts/ann-benchmarks/split-groundtruth.py` is: +The usage of `bench/ann/split-groundtruth.py` is: ```bash usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH @@ -86,27 +89,28 @@ options: ``` ##### Step 1: 
Prepare Dataset -The script `scripts/ann-benchmarks/get_dataset.py` will download and unpack the dataset in directory +The script `bench/ann/get_dataset.py` will download and unpack the dataset in directory that the user provides. As of now, only million-scale datasets are supported by this script. For more information on [datasets and formats](ann_benchmarks_dataset.md). The usage of this script is: ```bash -usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize] +usage: get_dataset.py [-h] [--name NAME] [--dataset-path DATASET_PATH] [--normalize] options: - -h, --help show this help message and exit - --name NAME dataset to download (default: glove-100-angular) - --path PATH path to download dataset (default: {os.getcwd()}/data) - --normalize normalize cosine distance to inner product (default: False) -``` + -h, --help show this help message and exit + --dataset DATASET dataset to download (default: glove-100-angular) + --dataset-path DATASET_PATH + path to download dataset (default: ${RAFT_HOME}/bench/ann/data) + --normalize normalize cosine distance to inner product (default: False) When option `normalize` is provided to the script, any dataset that has cosine distances will be normalized to inner product. So, for example, the dataset `glove-100-angular` -will be written at location `data/glove-100-inner/`. +will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`. +``` #### Step 2: Build and Search Index -The script `scripts/ann-benchmarks/run.py` will build and search indices for a given dataset and its +The script `bench/ann/run.py` will build and search indices for a given dataset and its specified configuration. To confirgure which algorithms are available, we use `algos.yaml`. To configure building/searching indices for a dataset, look at [index configuration](#json-index-config). @@ -120,21 +124,36 @@ raft_ivf_pq: available in `raft/cpp/build/`. 
`disabled` : denotes whether an algorithm should be excluded from benchmark runs. -The usage of the script `scripts/run.py` is: +The usage of the script `bench/ann/run.py` is: ```bash -usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force] +usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f] + +options: +usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f] options: -h, --help show this help message and exit --configuration CONFIGURATION path to configuration file for a dataset (default: None) + --dataset DATASET dataset whose configuration file will be used (default: glove-100-inner) + --dataset-path DATASET_PATH + path to dataset folder (default: ${RAFT_HOME}/bench/ann/data) --build --search --algorithms ALGORITHMS run only comma separated list of named algorithms (default: None) --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) - --force re-run algorithms even if their results already exist (default: False) + -f, --force re-run algorithms even if their results already exist (default: False) ``` +`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset. +The configuration file should be name as `.json`. It is optional if the name of the dataset is +provided with the `dataset` argument, in which case +a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/.json` + +`dataset-path` : +1. data is read from `/` +2. indices are built in `//index` +3. search results are stored in `//result` `build` and `search` : if both parameters are not supplied to the script then it is assumed both are `True`. @@ -143,26 +162,25 @@ it is assumed both are `True`. 
is available in `algos.yaml` and not disabled, as well as having an associated executable. #### Step 3: Evaluating Results -The script `scripts/ann-benchmarks/data_export.py` will evaluate results for a dataset whose index has been built -and search with at least one algorithm. For every result file that is supplied to the script, the output +The script `bench/ann/data_export.py` will evaluate results for a dataset whose index has been built +and searched with at least one algorithm. For every result file that is available to the script, the output will be combined and written to a CSV file. The usage of this script is: ```bash -usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH +usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH] options: -h, --help show this help message and exit --output OUTPUT Path to the CSV output file (default: None) --recompute Recompute metrics (default: False) - --groundtruth GROUNDTRUTH - Path to groundtruth.neighbors.ibin file for a dataset (default: None) + --dataset DATASET Name of the dataset to export results for (default: glove-100-inner) + --dataset-path DATASET_PATH + path to dataset folder (default: ${RAFT_HOME}/bench/ann/data) ``` -`result_filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example) - #### Step 4: Plot Results -The script `scripts/ann-benchmarks/plot.py` will plot all results evaluated to a CSV file for a given dataset. +The script `bench/ann/plot.py` will plot all results evaluated to a CSV file for a given dataset. 
The usage of this script is: ```bash @@ -170,9 +188,9 @@ usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE options: -h, --help show this help message and exit - --result_csv RESULT_CSV + --result-csv RESULT_CSV Path to CSV Results (default: None) - --output OUTPUT Path to the PNG output file (default: /home/nfs/dgala/raft/out.png) + --output OUTPUT Path to the PNG output file (default: ${RAFT_HOME}/out.png) --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) @@ -182,6 +200,10 @@ options: All algorithms present in the CSV file supplied to this script with parameter `result_csv` will appear in the plot. +The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall. + +![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png) + ## Adding a new ANN algorithm ### Implementation and Configuration Implementation of a new algorithm should be a C++ class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. 
diff --git a/img/raft-vector-search-batch-10.png b/img/raft-vector-search-batch-10.png new file mode 100644 index 0000000000..5416e611f2 Binary files /dev/null and b/img/raft-vector-search-batch-10.png differ diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index b61fb4ab02..f4af519dc1 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -85,33 +85,26 @@ def compute_new_centroids(X, -------- >>> import cupy as cp - >>> from pylibraft.common import Handle >>> from pylibraft.cluster.kmeans import compute_new_centroids - >>> # A single RAFT handle can optionally be reused across >>> # pylibraft functions. >>> handle = Handle() - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) ... >>> labels = cp.random.randint(0, high=n_clusters, size=n_samples, ... dtype=cp.int32) - - >>> new_centroids = cp.empty((n_clusters, n_features), dtype=cp.float32) - + >>> new_centroids = cp.empty((n_clusters, n_features), + ... dtype=cp.float32) >>> compute_new_centroids( ... X, centroids, labels, new_centroids, handle=handle ... ) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -221,11 +214,9 @@ def init_plus_plus(X, n_clusters=None, seed=None, handle=None, centroids=None): >>> import cupy as cp >>> from pylibraft.cluster.kmeans import init_plus_plus - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) @@ -301,19 +292,14 @@ def cluster_cost(X, centroids, handle=None): -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import cluster_cost - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) - >>> inertia = cluster_cost(X, centroids) """ x_cai = X.__cuda_array_interface__ @@ -524,13 +510,10 @@ def fit( -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import fit, KMeansParams - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index b4cdb9b0c1..7e3dc289e0 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -197,8 +197,8 @@ cdef class Handle(DeviceResources): _HANDLE_PARAM_DOCSTRING = """ - handle : Optional RAFT resource handle for reusing expensive CUDA - resources. If a handle isn't supplied, CUDA resources will be + handle : Optional RAFT resource handle for reusing CUDA resources. + If a handle isn't supplied, CUDA resources will be allocated inside this function and synchronized before the function exits. 
If a handle is supplied, you will need to explicitly synchronize yourself by calling `handle.sync()` diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd index 6b202c2b69..17dd2d8bfd 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -30,6 +30,12 @@ from pylibraft.common.cpp.mdspan cimport ( from pylibraft.common.handle cimport device_resources from pylibraft.common.optional cimport make_optional, optional +# Cython doesn't like `const float` inside template parameters +# hack around this with using typedefs +ctypedef const float const_float +ctypedef const int8_t const_int8_t +ctypedef const uint8_t const_uint8_t + cdef device_matrix_view[float, int64_t, row_major] get_dmv_float( array, check_shape) except * @@ -49,6 +55,15 @@ cdef optional[device_matrix_view[int64_t, int64_t, row_major]] make_optional_vie cdef device_matrix_view[uint32_t, int64_t, row_major] get_dmv_uint32( array, check_shape) except * +cdef device_matrix_view[const_float, int64_t, row_major] get_const_dmv_float( + array, check_shape) except * + +cdef device_matrix_view[const_uint8_t, int64_t, row_major] get_const_dmv_uint8( + array, check_shape) except * + +cdef device_matrix_view[const_int8_t, int64_t, row_major] get_const_dmv_int8( + array, check_shape) except * + cdef host_matrix_view[float, int64_t, row_major] get_hmv_float( array, check_shape) except * @@ -63,3 +78,12 @@ cdef host_matrix_view[int64_t, int64_t, row_major] get_hmv_int64( cdef host_matrix_view[uint32_t, int64_t, row_major] get_hmv_uint32( array, check_shape) except * + +cdef host_matrix_view[const_float, int64_t, row_major] get_const_hmv_float( + array, check_shape) except * + +cdef host_matrix_view[const_uint8_t, int64_t, row_major] get_const_hmv_uint8( + array, check_shape) except * + +cdef host_matrix_view[const_int8_t, int64_t, row_major] get_const_hmv_int8( + array, check_shape) except * diff --git 
a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index 1219b1612d..7442a6bb89 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -193,6 +193,39 @@ cdef device_matrix_view[int64_t, int64_t, row_major] \ cai.data, shape[0], shape[1]) +cdef device_matrix_view[const_float, int64_t, row_major] \ + get_const_dmv_float(cai, check_shape) except *: + if cai.dtype != np.float32: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_device_matrix_view[const_float, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef device_matrix_view[const_uint8_t, int64_t, row_major] \ + get_const_dmv_uint8(cai, check_shape) except *: + if cai.dtype != np.uint8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_device_matrix_view[const_uint8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef device_matrix_view[const_int8_t, int64_t, row_major] \ + get_const_dmv_int8(cai, check_shape) except *: + if cai.dtype != np.int8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_device_matrix_view[const_int8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + cdef optional[device_matrix_view[int64_t, int64_t, row_major]] \ make_optional_view_int64(device_matrix_view[int64_t, int64_t, row_major]& dmv) except *: # noqa: E501 return make_optional[device_matrix_view[int64_t, int64_t, 
row_major]](dmv) @@ -222,7 +255,6 @@ cdef host_matrix_view[float, int64_t, row_major] \ return make_host_matrix_view[float, int64_t, row_major]( cai.data, shape[0], shape[1]) - cdef host_matrix_view[uint8_t, int64_t, row_major] \ get_hmv_uint8(cai, check_shape) except *: if cai.dtype != np.uint8: @@ -265,3 +297,36 @@ cdef host_matrix_view[uint32_t, int64_t, row_major] \ shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) return make_host_matrix_view[uint32_t, int64_t, row_major]( cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[const_float, int64_t, row_major] \ + get_const_hmv_float(cai, check_shape) except *: + if cai.dtype != np.float32: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[const_float, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[const_uint8_t, int64_t, row_major] \ + get_const_hmv_uint8(cai, check_shape) except *: + if cai.dtype != np.uint8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[const_uint8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[const_int8_t, int64_t, row_major] \ + get_const_hmv_int8(cai, check_shape) except *: + if cai.dtype != np.int8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[const_int8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) diff --git 
a/python/pylibraft/pylibraft/neighbors/brute_force.pyx b/python/pylibraft/pylibraft/neighbors/brute_force.pyx index 2d118072ab..4aa47b8a18 100644 --- a/python/pylibraft/pylibraft/neighbors/brute_force.pyx +++ b/python/pylibraft/pylibraft/neighbors/brute_force.pyx @@ -95,7 +95,6 @@ def knn(dataset, queries, k=None, indices=None, distances=None, distances : Optional array interface compliant matrix shape (n_queries, k), dtype float. If supplied, neighbor indices will be written here in-place. (default None) - {handle_docstring} Returns @@ -108,16 +107,12 @@ def knn(dataset, queries, k=None, indices=None, distances=None, Examples -------- - >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors.brute_force import knn - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> # Search using the built index diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx index 7d758a32ef..e0c59a5ed3 100644 --- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx +++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx @@ -69,6 +69,12 @@ from pylibraft.common.cpp.mdspan cimport ( row_major, ) from pylibraft.common.mdspan cimport ( + get_const_dmv_float, + get_const_dmv_int8, + get_const_dmv_uint8, + get_const_hmv_float, + get_const_hmv_int8, + get_const_hmv_uint8, get_dmv_float, get_dmv_int8, get_dmv_int64, @@ -85,6 +91,25 @@ from pylibraft.neighbors.common cimport _get_metric_string cdef class IndexParams: + """" + Parameters to build index for CAGRA nearest neighbor search + + Parameters + ---------- + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2 + intermediate_graph_degree : int, 
default = 128 + + graph_degree : int, default = 64 + + add_data_on_build : bool, default = True + After training the coarse and fine quantizers, we will populate + the index with the dataset if add_data_on_build == True, otherwise + the index is left empty, and the extend method can be used + to add new vectors to the index. + """ cdef c_cagra.index_params params def __init__(self, *, @@ -92,29 +117,6 @@ cdef class IndexParams: intermediate_graph_degree=128, graph_degree=64, add_data_on_build=True): - """" - Parameters to build index for CAGRA nearest neighbor search - - Parameters - ---------- - metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean", "inner_product", - "euclidean"], where - - sqeuclidean is the euclidean distance without the square root - operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - - euclidean is the euclidean distance - - inner product distance is defined as - distance(a, b) = \\sum_i a_i * b_i. - intermediate_graph_degree : int, default = 128 - - graph_degree : int, default = 64 - - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - """ self.params.metric = _get_metric(metric) self.params.metric_arg = 0 self.params.intermediate_graph_degree = intermediate_graph_degree @@ -163,9 +165,34 @@ cdef class IndexFloat(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" + @auto_sync_handle + def update_dataset(self, dataset, handle=None): + """ Replace the dataset with a new dataset. 
+ + Parameters + ---------- + dataset : array interface compliant matrix shape (n_samples, dim) + {handle_docstring} + """ + cdef device_resources* handle_ = \ + handle.getHandle() + + dataset_ai = wrap_array(dataset) + dataset_dt = dataset_ai.dtype + _check_input_array(dataset_ai, [np.dtype("float32")]) + + if dataset_ai.from_cai: + self.index[0].update_dataset(deref(handle_), + get_const_dmv_float(dataset_ai, + check_shape=True)) + else: + self.index[0].update_dataset(deref(handle_), + get_const_hmv_float(dataset_ai, + check_shape=True)) + @property def metric(self): return self.index[0].metric() @@ -199,11 +226,36 @@ cdef class IndexInt8(Index): self.index = new c_cagra.index[int8_t, uint32_t]( deref(handle_)) + @auto_sync_handle + def update_dataset(self, dataset, handle=None): + """ Replace the dataset with a new dataset. + + Parameters + ---------- + dataset : array interface compliant matrix shape (n_samples, dim) + {handle_docstring} + """ + cdef device_resources* handle_ = \ + handle.getHandle() + + dataset_ai = wrap_array(dataset) + dataset_dt = dataset_ai.dtype + _check_input_array(dataset_ai, [np.dtype("byte")]) + + if dataset_ai.from_cai: + self.index[0].update_dataset(deref(handle_), + get_const_dmv_int8(dataset_ai, + check_shape=True)) + else: + self.index[0].update_dataset(deref(handle_), + get_const_hmv_int8(dataset_ai, + check_shape=True)) + def __repr__(self): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -239,11 +291,36 @@ cdef class IndexUint8(Index): self.index = new c_cagra.index[uint8_t, uint32_t]( deref(handle_)) + @auto_sync_handle + def update_dataset(self, dataset, handle=None): + """ Replace the dataset with a new dataset. 
+ + Parameters + ---------- + dataset : array interface compliant matrix shape (n_samples, dim) + {handle_docstring} + """ + cdef device_resources* handle_ = \ + handle.getHandle() + + dataset_ai = wrap_array(dataset) + dataset_dt = dataset_ai.dtype + _check_input_array(dataset_ai, [np.dtype("ubyte")]) + + if dataset_ai.from_cai: + self.index[0].update_dataset(deref(handle_), + get_const_dmv_uint8(dataset_ai, + check_shape=True)) + else: + self.index[0].update_dataset(deref(handle_), + get_const_hmv_uint8(dataset_ai, + check_shape=True)) + def __repr__(self): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -280,8 +357,8 @@ def build(IndexParams index_params, dataset, handle=None): It is required that both the dataset and the optimized graph fit the GPU memory. - The following distance metrics are supported: - - L2 + The following distance metrics are supported: + - L2 Parameters ---------- @@ -298,31 +375,23 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> k = 10 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> handle = DeviceResources() >>> build_params = cagra.IndexParams(metric="sqeuclidean") - >>> index = cagra.build(build_params, dataset, handle=handle) - >>> distances, neighbors = cagra.search(cagra.SearchParams(), ... index, dataset, ... 
k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) """ @@ -415,6 +484,54 @@ def build(IndexParams index_params, dataset, handle=None): cdef class SearchParams: + """ + CAGRA search parameters + + Parameters + ---------- + max_queries: int, default = 0 + Maximum number of queries to search at the same time (batch size). + Auto select when 0. + itopk_size: int, default = 64 + Number of intermediate search results retained during the search. + This is the main knob to adjust trade off between accuracy and + search speed. Higher values improve the search accuracy. + max_iterations: int, default = 0 + Upper limit of search iterations. Auto select when 0. + algo: string denoting the search algorithm to use, default = "auto" + Valid values for algo: ["auto", "single_cta", "multi_cta"], where + - auto will automatically select the best value based on query size + - single_cta is better when query contains larger number of + vectors (e.g >10) + - multi_cta is better when query contains only a few vectors + team_size: int, default = 0 + Number of threads used to calculate a single distance. 4, 8, 16, + or 32. + search_width: int, default = 1 + Number of graph nodes to select as the starting point for the + search in each iteration. + min_iterations: int, default = 0 + Lower limit of search iterations. + thread_block_size: int, default = 0 + Thread block size. 0, 64, 128, 256, 512, 1024. + Auto selection when 0. + hashmap_mode: string denoting the type of hash map to use. It's + usually better to allow the algorithm to select this value., + default = "auto" + Valid values for hashmap_mode: ["auto", "small", "hash"], where + - auto will automatically select the best value based on algo + - small will use the small shared memory hash table with resetting. + - hash will use a single hash table in global memory. 
+ hashmap_min_bitlen: int, default = 0 + Upper limit of hashmap fill rate. More than 0.1, less than 0.9. + hashmap_max_fill_rate: float, default = 0.5 + Upper limit of hashmap fill rate. More than 0.1, less than 0.9. + num_random_samplings: int, default = 1 + Number of iterations of initial random seed node selection. 1 or + more. + rand_xor_mask: int, default = 0x128394 + Bit mask used for initial random seed node selection. + """ cdef c_cagra.search_params params def __init__(self, *, @@ -431,56 +548,6 @@ cdef class SearchParams: hashmap_max_fill_rate=0.5, num_random_samplings=1, rand_xor_mask=0x128394): - """ - CAGRA search parameters - - Parameters - ---------- - max_queries: int, default = 0 - Maximum number of queries to search at the same time (batch size). - Auto select when 0. - itopk_size: int, default = 64 - Number of intermediate search results retained during the search. - This is the main knob to adjust trade off between accuracy and - search speed. Higher values improve the search accuracy. - max_iterations: int, default = 0 - Upper limit of search iterations. Auto select when 0. - algo: string denoting the search algorithm to use, default = "auto" - Valid values for algo: ["auto", "single_cta", "multi_cta"], where - - auto will automatically select the best value based on query size - - single_cta is better when query contains larger number of - vectors (e.g >10) - - multi_cta is better when query contains only a few vectors - team_size: int, default = 0 - Number of threads used to calculate a single distance. 4, 8, 16, - or 32. - search_width: int, default = 1 - Number of graph nodes to select as the starting point for the - search in each iteration. - min_iterations: int, default = 0 - Lower limit of search iterations. - thread_block_size: int, default = 0 - Thread block size. 0, 64, 128, 256, 512, 1024. - Auto selection when 0. - hashmap_mode: string denoting the type of hash map to use. 
It's - usually better to allow the algorithm to select this value., - default = "auto" - Valid values for hashmap_mode: ["auto", "small", "hash"], where - - auto will automatically select the best value based on algo - - small will use the small shared memory hash table with resetting. - - hash will use a single hash table in global memory. - hashmap_min_bitlen: int, default = 0 - Upper limit of hashmap fill rate. More than 0.1, less than 0.9. - hashmap_max_fill_rate: float, default = 0.5 - Upper limit of hashmap fill rate. More than 0.1, less than 0.9. - num_random_samplings: int, default = 1 - Number of iterations of initial random seed node selection. 1 or - more. - rand_xor_mask: int, default = 0x128394 - Bit mask used for initial random seed node selection. - - - """ self.params.max_queries = max_queries self.params.itopk_size = itopk_size self.params.max_iterations = max_iterations @@ -514,9 +581,13 @@ cdef class SearchParams: self.params.rand_xor_mask = rand_xor_mask def __repr__(self): - # todo(dantegd): add all relevant attrs attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in ["max_queries"]] + for attr in [ + "max_queries", "itopk_size", "max_iterations", "algo", + "team_size", "search_width", "min_iterations", + "thread_block_size", "hashmap_mode", + "hashmap_min_bitlen", "hashmap_max_fill_rate", + "num_random_samplings", "rand_xor_mask"]] return "SearchParams(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -604,20 +675,16 @@ def search(SearchParams search_params, Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> dataset = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -626,17 +693,14 @@ def search(SearchParams search_params, ... max_queries=100, ... itopk_size=64 ... ) - >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performad with same query size. >>> distances, neighbors = cagra.search(search_params, index, queries, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> neighbors = cp.asarray(neighbors) >>> distances = cp.asarray(distances) """ @@ -710,11 +774,11 @@ def search(SearchParams search_params, @auto_sync_handle -def save(filename, Index index, handle=None): +def save(filename, Index index, bool include_dataset=True, handle=None): """ - Saves the index to file. + Saves the index to a file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change. Parameters @@ -723,24 +787,29 @@ def save(filename, Index index, handle=None): Name of the file. index : Index Trained CAGRA index. + include_dataset : bool + Whether or not to write out the dataset along with the index. Including + the dataset in the serialized index will use extra disk space, and + might not be desired if you already have a copy of the dataset on + disk. If this option is set to false, you will have to call + `index.update_dataset(dataset)` after loading the index. 
{handle_docstring} Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) + >>> # Serialize and deserialize the cagra index built >>> cagra.save("my_index.bin", index, handle=handle) + >>> index_loaded = cagra.load("my_index.bin", handle=handle) """ if not index.trained: raise ValueError("Index need to be built before saving it.") @@ -759,15 +828,17 @@ def save(filename, Index index, handle=None): if index.active_index_type == "float32": idx_float = index c_cagra.serialize_file( - deref(handle_), c_filename, deref(idx_float.index)) + deref(handle_), c_filename, deref(idx_float.index), + include_dataset) elif index.active_index_type == "byte": idx_int8 = index c_cagra.serialize_file( - deref(handle_), c_filename, deref(idx_int8.index)) + deref(handle_), c_filename, deref(idx_int8.index), include_dataset) elif index.active_index_type == "ubyte": idx_uint8 = index c_cagra.serialize_file( - deref(handle_), c_filename, deref(idx_uint8.index)) + deref(handle_), c_filename, deref(idx_uint8.index), + include_dataset) else: raise ValueError( "Index dtype %s not supported" % index.active_index_type) @@ -778,7 +849,7 @@ def load(filename, handle=None): """ Loads index from file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change, therefore loading an index saved with a previous version of raft is not guaranteed to work. 
@@ -792,13 +863,6 @@ def load(filename, handle=None): ------- index : Index - Examples - -------- - >>> import cupy as cp - - >>> from pylibraft.common import DeviceResources - >>> from pylibraft.neighbors import cagra - """ if handle is None: handle = DeviceResources() @@ -810,11 +874,9 @@ def load(filename, handle=None): cdef IndexInt8 idx_int8 cdef IndexUint8 idx_uint8 - # we extract the dtype from the arrai interfaces in the file - with open(filename, 'rb') as f: - type_str = f.read(700).decode("utf-8", errors='ignore') - - dataset_dt = np.dtype(type_str[673:676]) + with open(filename, "rb") as f: + type_str = f.read(3).decode("utf8") + dataset_dt = np.dtype(type_str) if dataset_dt == np.float32: idx_float = IndexFloat(handle) diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd index 284c75b771..0c683bcd9b 100644 --- a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd +++ b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd @@ -36,6 +36,7 @@ from pylibraft.common.cpp.mdspan cimport ( row_major, ) from pylibraft.common.handle cimport device_resources +from pylibraft.common.mdspan cimport const_float, const_int8_t, const_uint8_t from pylibraft.common.optional cimport optional from pylibraft.distance.distance_type cimport DistanceType from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport ( @@ -90,6 +91,17 @@ cdef extern from "raft/neighbors/cagra_types.hpp" \ device_matrix_view[T, IdxT, row_major] dataset() device_matrix_view[T, IdxT, row_major] graph() + # hack: can't use the T template param here because of issues handling + # const w/ cython. 
introduce a new template param to get around this + void update_dataset[ValueT](const device_resources & handle, + host_matrix_view[ValueT, + int64_t, + row_major] dataset) + void update_dataset[ValueT](const device_resources & handle, + device_matrix_view[ValueT, + int64_t, + row_major] dataset) + cdef extern from "raft_runtime/neighbors/cagra.hpp" \ namespace "raft::runtime::neighbors::cagra" nogil: @@ -155,7 +167,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize(const device_resources& handle, string& str, - const index[float, uint32_t]& index) except + + const index[float, uint32_t]& index, + bool include_dataset) except + cdef void deserialize(const device_resources& handle, const string& str, @@ -163,7 +176,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize(const device_resources& handle, string& str, - const index[uint8_t, uint32_t]& index) except + + const index[uint8_t, uint32_t]& index, + bool include_dataset) except + cdef void deserialize(const device_resources& handle, const string& str, @@ -171,7 +185,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize(const device_resources& handle, string& str, - const index[int8_t, uint32_t]& index) except + + const index[int8_t, uint32_t]& index, + bool include_dataset) except + cdef void deserialize(const device_resources& handle, const string& str, @@ -179,7 +194,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize_file(const device_resources& handle, const string& filename, - const index[float, uint32_t]& index) except + + const index[float, uint32_t]& index, + bool include_dataset) except + cdef void deserialize_file(const device_resources& handle, const string& filename, @@ -187,7 +203,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize_file(const device_resources& handle, const string& filename, - const index[uint8_t, uint32_t]& index) except + + const index[uint8_t, 
uint32_t]& index, + bool include_dataset) except + cdef void deserialize_file(const device_resources& handle, const string& filename, @@ -195,7 +212,8 @@ cdef extern from "raft_runtime/neighbors/cagra.hpp" \ cdef void serialize_file(const device_resources& handle, const string& filename, - const index[int8_t, uint32_t]& index) except + + const index[int8_t, uint32_t]& index, + bool include_dataset) except + cdef void deserialize_file(const device_resources& handle, const string& filename, diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx index e265bee23b..d8fbdc74da 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx @@ -75,6 +75,45 @@ from pylibraft.neighbors.ivf_flat.cpp.c_ivf_flat cimport ( cdef class IndexParams: + """ + Parameters to build index for IVF-FLAT nearest neighbor search + + Parameters + ---------- + n_list : int, default = 1024 + The number of clusters used in the coarse quantizer. + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product", + "euclidean"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - euclidean is the euclidean distance + - inner product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + kmeans_n_iters : int, default = 20 + The number of iterations searching for kmeans centers during index + building. + kmeans_trainset_fraction : int, default = 0.5 + If kmeans_trainset_fraction is less than 1, then the dataset is + subsampled, and only n_samples * kmeans_trainset_fraction rows + are used for training. 
+ add_data_on_build : bool, default = True + After training the coarse and fine quantizers, we will populate + the index with the dataset if add_data_on_build == True, otherwise + the index is left empty, and the extend method can be used + to add new vectors to the index. + adaptive_centers : bool, default = False + By default (adaptive_centers = False), the cluster centers are + trained in `ivf_flat::build`, and never modified in + `ivf_flat::extend`. The alternative behavior (adaptive_centers + = true) is to update the cluster centers for new data when it is + added. In this case, `index.centers()` are always exactly the + centroids of the data in the corresponding clusters. The drawback + of this behavior is that the centroids depend on the order of + adding new data (through the classification of the added data); + that is, `index.centers()` "drift" together with the changing + distribution of the newly added data. + """ cdef c_ivf_flat.index_params params def __init__(self, *, @@ -84,45 +123,6 @@ cdef class IndexParams: kmeans_trainset_fraction=0.5, add_data_on_build=True, bool adaptive_centers=False): - """" - Parameters to build index for IVF-FLAT nearest neighbor search - - Parameters - ---------- - n_list : int, default = 1024 - The number of clusters used in the coarse quantizer. - metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean", "inner_product", - "euclidean"], where - - sqeuclidean is the euclidean distance without the square root - operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - - euclidean is the euclidean distance - - inner product distance is defined as - distance(a, b) = \\sum_i a_i * b_i. - kmeans_n_iters : int, default = 20 - The number of iterations searching for kmeans centers during index - building. 
- kmeans_trainset_fraction : int, default = 0.5 - If kmeans_trainset_fraction is less than 1, then the dataset is - subsampled, and only n_samples * kmeans_trainset_fraction rows - are used for training. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - adaptive_centers : bool, default = False - By default (adaptive_centers = False), the cluster centers are - trained in `ivf_flat::build`, and and never modified in - `ivf_flat::extend`. The alternative behavior (adaptive_centers - = true) is to update the cluster centers for new data when it is - added. In this case, `index.centers()` are always exactly the - centroids of the data in the corresponding clusters. The drawback - of this behavior is that the centroids depend on the order of - adding new data (through the classification of the added data); - that is, `index.centers()` "drift" together with the changing - distribution of the newly added data. - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,33 +333,27 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index_params = ivf_flat.IndexParams( ... n_lists=1024, ... metric="sqeuclidean") - >>> index = ivf_flat.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... 
dtype=cp.float32) >>> k = 10 - >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index, - ... queries, k, handle=handle) - + >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), + ... index, queries, k, + ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -439,25 +433,21 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) - + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... dtype=cp.float32) >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64) >>> index = ivf_flat.extend(index, more_data, indices) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -465,7 +455,6 @@ def extend(Index index, new_vectors, new_indices, handle=None): >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), ... index, queries, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -540,17 +529,17 @@ def extend(Index index, new_vectors, new_indices, handle=None): cdef class SearchParams: + """ + IVF-FLAT search parameters + + Parameters + ---------- + n_probes: int, default = 20 + The number of coarse clusters to select for the fine search. 
+ """ cdef c_ivf_flat.search_params params def __init__(self, *, n_probes=20): - """ - IVF-FLAT search parameters - - Parameters - ---------- - n_probes: int, default = 1024 - The number of course clusters to select for the fine search. - """ self.params.n_probes = n_probes def __repr__(self): @@ -595,20 +584,17 @@ def search(SearchParams search_params, Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) - + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -616,13 +602,11 @@ def search(SearchParams search_params, >>> search_params = ivf_flat.SearchParams( ... n_probes=20 ... ) - >>> distances, neighbors = ivf_flat.search(search_params, index, queries, - ... k, handle=handle) - + >>> distances, neighbors = ivf_flat.search(search_params, index, + ... queries, k, handle=handle) >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> neighbors = cp.asarray(neighbors) >>> distances = cp.asarray(distances) """ @@ -697,7 +681,7 @@ def search(SearchParams search_params, @auto_sync_handle def save(filename, Index index, handle=None): """ - Saves the index to file. + Saves the index to a file. Saving / loading the index is experimental. The serialization format is subject to change. 
@@ -713,18 +697,16 @@ def save(filename, Index index, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> ivf_flat.save("my_index.bin", index, handle=handle) """ if not index.trained: @@ -761,7 +743,7 @@ def save(filename, Index index, handle=None): @auto_sync_handle def load(filename, handle=None): """ - Loads index from file. + Loads index from a file. Saving / loading the index is experimental. The serialization format is subject to change, therefore loading an index saved with a previous @@ -780,29 +762,26 @@ def load(filename, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build and save index >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> ivf_flat.save("my_index.bin", index, handle=handle) >>> del index - >>> n_queries = 100 >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index = ivf_flat.load("my_index.bin", handle=handle) - - >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index, - ... queries, k=10, handle=handle) + >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), + ... index, queries, k=10, + ... 
handle=handle) """ if handle is None: handle = DeviceResources() diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 413a9a1d4b..0c1bbf6b9c 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -95,7 +95,68 @@ cdef _get_dtype_string(dtype): cdef class IndexParams: + """ + Parameters to build index for IVF-PQ nearest neighbor search + Parameters + ---------- + n_list : int, default = 1024 + The number of clusters used in the coarse quantizer. + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product", + "euclidean"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - euclidean is the euclidean distance + - inner product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + kmeans_n_iters : int, default = 20 + The number of iterations searching for kmeans centers during index + building. + kmeans_trainset_fraction : int, default = 0.5 + If kmeans_trainset_fraction is less than 1, then the dataset is + subsampled, and only n_samples * kmeans_trainset_fraction rows + are used for training. + pq_bits : int, default = 8 + The bit length of the vector element after quantization. + pq_dim : int, default = 0 + The dimensionality of a the vector after product quantization. + When zero, an optimal value is selected using a heuristic. Note + pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim' + results in a smaller index size and better search performance, but + lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, + but multiple of 8 are desirable for good performance. If 'pq_bits' + is not 8, 'pq_dim' should be a multiple of 8. For good performance, + it is desirable that 'pq_dim' is a multiple of 32. 
Ideally, + 'pq_dim' should be also a divisor of the dataset dim. + codebook_kind : string, default = "subspace" + Valid values ["subspace", "cluster"] + force_random_rotation : bool, default = False + Apply a random rotation matrix on the input data and queries even + if `dim % pq_dim == 0`. Note: if `dim` is not multiple of `pq_dim`, + a random rotation is always applied to the input data and queries + to transform the working space from `dim` to `rot_dim`, which may + be slightly larger than the original space and is a multiple + of `pq_dim` (`rot_dim % pq_dim == 0`). However, this transform is + not necessary when `dim` is multiple of `pq_dim` (`dim == rot_dim`, + hence no need in adding "extra" data columns / features). By + default, if `dim == rot_dim`, the rotation transform is + initialized with the identity matrix. When + `force_random_rotation == True`, a random orthogonal transform + matrix is generated regardless of the values of `dim` and `pq_dim`. + add_data_on_build : bool, default = True + After training the coarse and fine quantizers, we will populate + the index with the dataset if add_data_on_build == True, otherwise + the index is left empty, and the extend method can be used + to add new vectors to the index. + conservative_memory_allocation : bool, default = False + By default, the algorithm allocates more space than necessary for + individual clusters (`list_data`). This allows to amortize the cost + of memory allocation and reduce the number of data copies during + repeated calls to `extend` (extending the database). + To disable this behavior and use as little GPU memory for the + database as possible, set this flag to `True`. 
+ """ def __init__(self, *, n_lists=1024, metric="sqeuclidean", @@ -107,69 +168,6 @@ cdef class IndexParams: force_random_rotation=False, add_data_on_build=True, conservative_memory_allocation=False): - """" - Parameters to build index for IVF-PQ nearest neighbor search - - Parameters - ---------- - n_list : int, default = 1024 - The number of clusters used in the coarse quantizer. - metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean", "inner_product", - "euclidean"], where - - sqeuclidean is the euclidean distance without the square root - operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - - euclidean is the euclidean distance - - inner product distance is defined as - distance(a, b) = \\sum_i a_i * b_i. - kmeans_n_iters : int, default = 20 - The number of iterations searching for kmeans centers during index - building. - kmeans_trainset_fraction : int, default = 0.5 - If kmeans_trainset_fraction is less than 1, then the dataset is - subsampled, and only n_samples * kmeans_trainset_fraction rows - are used for training. - pq_bits : int, default = 8 - The bit length of the vector element after quantization. - pq_dim : int, default = 0 - The dimensionality of a the vector after product quantization. - When zero, an optimal value is selected using a heuristic. Note - pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim' - results in a smaller index size and better search performance, but - lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, - but multiple of 8 are desirable for good performance. If 'pq_bits' - is not 8, 'pq_dim' should be a multiple of 8. For good performance, - it is desirable that 'pq_dim' is a multiple of 32. Ideally, - 'pq_dim' should be also a divisor of the dataset dim. 
- codebook_kind : string, default = "subspace" - Valid values ["subspace", "cluster"] - force_random_rotation : bool, default = False - Apply a random rotation matrix on the input data and queries even - if `dim % pq_dim == 0`. Note: if `dim` is not multiple of `pq_dim`, - a random rotation is always applied to the input data and queries - to transform the working space from `dim` to `rot_dim`, which may - be slightly larger than the original space and and is a multiple - of `pq_dim` (`rot_dim % pq_dim == 0`). However, this transform is - not necessary when `dim` is multiple of `pq_dim` (`dim == rot_dim`, - hence no need in adding "extra" data columns / features). By - default, if `dim == rot_dim`, the rotation transform is - initialized with the identity matrix. When - `force_random_rotation == True`, a random orthogonal transform - matrix is generated regardless of the values of `dim` and `pq_dim`. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - conservative_memory_allocation : bool, default = True - By default, the algorithm allocates more space than necessary for - individual clusters (`list_data`). This allows to amortize the cost - of memory allocation and reduce the number of data copies during - repeated calls to `extend` (extending the database). - To disable this behavior and use as little GPU memory for the - database as possible, set this flat to `True`. 
- - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,14 +331,11 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() @@ -349,17 +344,14 @@ def build(IndexParams index_params, dataset, handle=None): ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 10 >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -433,25 +425,20 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) - >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... dtype=cp.float32) >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64) >>> index = ivf_pq.extend(index, more_data, indices) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... 
dtype=cp.float32) @@ -459,11 +446,9 @@ def extend(Index index, new_vectors, new_indices, handle=None): >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), ... index, queries, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) """ @@ -520,29 +505,27 @@ def extend(Index index, new_vectors, new_indices, handle=None): cdef class SearchParams: + """ + IVF-PQ search parameters + Parameters + ---------- + n_probes: int, default = 20 + The number of coarse clusters to select for the fine search. + lut_dtype: default = np.float32 + Data type of look up table to be created dynamically at search + time. The use of low-precision types reduces the amount of shared + memory required at search time, so fast shared memory kernels can + be used even for datasets with large dimensionality. Note that + the recall is slightly degraded when low-precision type is + selected. Possible values [np.float32, np.float16, np.uint8] + internal_distance_dtype: default = np.float32 + Storage data type for distance/similarity computation. + Possible values [np.float32, np.float16] + """ def __init__(self, *, n_probes=20, lut_dtype=np.float32, internal_distance_dtype=np.float32): - """ - IVF-PQ search parameters - - Parameters - ---------- - n_probes: int, default = 1024 - The number of course clusters to select for the fine search. - lut_dtype: default = np.float32 - Data type of look up table to be created dynamically at search - time. The use of low-precision types reduces the amount of shared - memory required at search time, so fast shared memory kernels can - be used even for datasets with large dimansionality. Note that - the recall is slightly degraded when low-precision type is - selected. 
Possible values [np.float32, np.float16, np.uint8] - internal_distance_dtype: default = np.float32 - Storage data type for distance/similarity computation. - Possible values [np.float32, np.float16] - """ - self.params.n_probes = n_probes self.params.lut_dtype = _map_dtype_np_to_cuda(lut_dtype) self.params.internal_distance_dtype = \ @@ -611,20 +594,16 @@ def search(SearchParams search_params, Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -634,7 +613,6 @@ def search(SearchParams search_params, ... lut_dtype=cp.float16, ... internal_distance_dtype=cp.float32 ... ) - >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performad with same query size. @@ -647,11 +625,9 @@ def search(SearchParams search_params, >>> distances, neighbors = ivf_pq.search(search_params, index, queries, ... k, memory_resource=mr, ... handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> neighbors = cp.asarray(neighbors) >>> distances = cp.asarray(distances) """ @@ -728,7 +704,7 @@ def search(SearchParams search_params, @auto_sync_handle def save(filename, Index index, handle=None): """ - Saves the index to file. + Saves the index to a file. Saving / loading the index is experimental. The serialization format is subject to change. 
@@ -744,15 +720,12 @@ def save(filename, Index index, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) @@ -774,7 +747,7 @@ def save(filename, Index index, handle=None): @auto_sync_handle def load(filename, handle=None): """ - Loads index from file. + Loads index from a file. Saving / loading the index is experimental. The serialization format is subject to change, therefore loading an index saved with a previous @@ -793,27 +766,22 @@ def load(filename, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build and save index >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) >>> ivf_pq.save("my_index.bin", index, handle=handle) >>> del index - >>> n_queries = 100 >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index = ivf_pq.load("my_index.bin", handle=handle) - >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index, ... 
queries, k=10, handle=handle) """ diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx index 5e57da713c..a9bf811c9f 100644 --- a/python/pylibraft/pylibraft/neighbors/refine.pyx +++ b/python/pylibraft/pylibraft/neighbors/refine.pyx @@ -192,19 +192,19 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None, queries : array interface compliant matrix, shape (n_queries, dim) Supported dtype [float, int8, uint8] candidates : array interface compliant matrix, shape (n_queries, k0) - dtype int64 + Supported dtype int64 k : int Number of neighbors to search (k <= k0). Optional if indices or distances arrays are given (in which case their second dimension is k). - indices : Optional array interface compliant matrix shape - (n_queries, k), dtype int64. If supplied, neighbor - indices will be written here in-place. (default None) - Supported dtype int64 - distances : Optional array interface compliant matrix shape - (n_queries, k), dtype float. If supplied, neighbor - indices will be written here in-place. (default None) - + indices : Optional array interface compliant matrix shape \ + (n_queries, k). + If supplied, neighbor indices will be written here in-place. + (default None). Supported dtype int64. + distances : Optional array interface compliant matrix shape \ + (n_queries, k). + If supplied, neighbor indices will be written here in-place. + (default None) Supported dtype float. {handle_docstring} Returns @@ -213,36 +213,30 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None, Examples -------- - >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq, refine - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) >>> handle = DeviceResources() - >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean", + >>> index_params = ivf_pq.IndexParams(n_lists=1024, + ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 40 >>> _, candidates = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> k = 10 >>> distances, neighbors = refine(dataset, queries, candidates, k, ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() diff --git a/python/pylibraft/pylibraft/test/test_cagra.py b/python/pylibraft/pylibraft/test/test_cagra.py index 435b2878a2..74e9f53b91 100644 --- a/python/pylibraft/pylibraft/test/test_cagra.py +++ b/python/pylibraft/pylibraft/test/test_cagra.py @@ -255,7 +255,8 @@ def test_cagra_search_params(params): @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.ubyte]) -def test_save_load(dtype): +@pytest.mark.parametrize("include_dataset", [True, False]) +def test_save_load(dtype, include_dataset): n_rows = 10000 n_cols = 50 n_queries = 1000 @@ -268,9 +269,14 @@ def test_save_load(dtype): assert index.trained filename = "my_index.bin" - cagra.save(filename, index) + cagra.save(filename, index, include_dataset=include_dataset) loaded_index = cagra.load(filename) + # if we didn't save the dataset with the index, we need to update the + # index with an already loaded copy + if not include_dataset: + loaded_index.update_dataset(dataset) + queries = generate_data((n_queries, n_cols), dtype) queries_device = device_ndarray(queries) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 68b6e7e2ad..bdbcf61e0f 100644 --- 
a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -35,8 +35,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "dask-cuda==23.10.*", - "dask>=2023.5.1", - "distributed>=2023.5.1", + "dask>=2023.7.1", + "distributed>=2023.7.1", "joblib>=0.11", "numba>=0.57", "numpy>=1.21",