Merge branch 'branch-24.10' into cagra_fp16
achirkin committed Sep 27, 2024
2 parents: c6ae3b4 + b93b8f6, commit ef3314c
Showing 249 changed files with 7,205 additions and 8,086 deletions.
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -37,7 +37,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,7 +37,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -34,7 +34,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -34,7 +34,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -35,7 +35,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-aarch64=11.8
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -35,7 +35,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-64=11.8
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -32,7 +32,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- openblas
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -32,7 +32,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- openblas
3 changes: 0 additions & 3 deletions conda/recipes/libcuvs/conda_build_config.yaml
@@ -19,9 +19,6 @@ c_stdlib_version:
cmake_version:
- ">=3.26.4,!=3.30.0"

nccl_version:
- ">=2.9.9"

h5py_version:
- ">=3.8.0"

199 changes: 94 additions & 105 deletions cpp/CMakeLists.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions cpp/bench/ann/src/common/ann_types.hpp
@@ -35,6 +35,7 @@ enum class Mode {
enum class MemoryType {
kHost,
kHostMmap,
kHostPinned,
kDevice,
};

@@ -60,6 +61,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
return MemoryType::kHost;
} else if (memory_type == "mmap") {
return MemoryType::kHostMmap;
} else if (memory_type == "pinned") {
return MemoryType::kHostPinned;
} else if (memory_type == "device") {
return MemoryType::kDevice;
} else {
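
The new kHostPinned option lets a benchmark request page-locked ("pinned") host memory for its data. Below is a minimal, self-contained sketch of the resulting string-to-enum mapping; it mirrors the names in ann_types.hpp, but the error branch and the main() driver are illustrative assumptions rather than the benchmark's actual code.

// Hypothetical stand-alone sketch of the string -> MemoryType mapping with the new
// "pinned" option; names follow ann_types.hpp, but this is not the real header.
#include <iostream>
#include <stdexcept>
#include <string>

enum class MemoryType { kHost, kHostMmap, kHostPinned, kDevice };

inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
{
  if (memory_type == "host") { return MemoryType::kHost; }
  if (memory_type == "mmap") { return MemoryType::kHostMmap; }
  if (memory_type == "pinned") { return MemoryType::kHostPinned; }  // added in this commit
  if (memory_type == "device") { return MemoryType::kDevice; }
  throw std::invalid_argument("unknown memory type: " + memory_type);  // assumed error handling
}

int main()
{
  std::cout << (parse_memory_type("pinned") == MemoryType::kHostPinned) << "\n";  // prints 1
  return 0;
}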
56 changes: 51 additions & 5 deletions cpp/bench/ann/src/common/dataset.hpp
@@ -286,15 +286,56 @@
{
switch (memory_type) {
case MemoryType::kDevice: return query_set_on_gpu();
default: return query_set();
case MemoryType::kHost: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (query_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
query_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (!query_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
query_set_pinned_ = true;
}
#endif
return r;
}
default: return nullptr;
}
}

auto base_set(MemoryType memory_type) const -> const T*
{
switch (memory_type) {
case MemoryType::kDevice: return base_set_on_gpu();
case MemoryType::kHost: return base_set();
case MemoryType::kHost: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (base_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
base_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (!base_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
base_set_pinned_ = true;
}
#endif
return r;
}
case MemoryType::kHostMmap: return mapped_base_set();
default: return nullptr;
}
@@ -315,18 +356,23 @@ class dataset {
mutable T* d_query_set_ = nullptr;
mutable T* mapped_base_set_ = nullptr;
mutable int32_t* gt_set_ = nullptr;

mutable bool base_set_pinned_ = false;
mutable bool query_set_pinned_ = false;
};

template <typename T>
dataset<T>::~dataset()
{
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
#ifndef BUILD_CPU_ONLY
if (d_base_set_) { cudaFree(d_base_set_); }
if (d_query_set_) { cudaFree(d_query_set_); }
if (base_set_pinned_) { cudaHostUnregister(base_set_); }
if (query_set_pinned_) { cudaHostUnregister(query_set_); }
#endif
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
}

template <typename T>
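
The dataset changes above pin already-allocated host arrays in place on demand: cudaHostRegister when kHostPinned is requested, cudaHostUnregister when plain host memory is requested again or the dataset is destroyed. The stand-alone sketch below shows that pin-in-place pattern on its own, with error handling trimmed down; it is an illustration under those assumptions, not code from the benchmark.

// Minimal sketch of pinning an existing pageable allocation in place (assumes a CUDA
// toolkit is available; most error checks are omitted for brevity).
#include <cstddef>
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

int main()
{
  constexpr std::size_t n = 1 << 20;
  std::vector<float> host_data(n, 1.0f);  // ordinary pageable host memory

  // Page-lock the existing allocation so H2D copies can use the faster DMA path.
  if (cudaHostRegister(host_data.data(), n * sizeof(float), cudaHostRegisterDefault) !=
      cudaSuccess) {
    std::fprintf(stderr, "cudaHostRegister failed\n");
    return 1;
  }

  float* device_data = nullptr;
  cudaMalloc(&device_data, n * sizeof(float));
  cudaMemcpy(device_data, host_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // Unpin before the vector is destroyed, mirroring the cleanup in ~dataset().
  cudaHostUnregister(host_data.data());
  cudaFree(device_data);
  return 0;
}

Registering the existing allocation avoids reallocating and copying the dataset while still giving transfers the bandwidth of page-locked memory.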
133 changes: 98 additions & 35 deletions cpp/bench/ann/src/common/util.hpp
@@ -198,42 +198,71 @@ inline auto get_stream_from_global_pool() -> cudaStream_t
#endif
}

struct result_buffer {
explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream}
/** The workspace buffer for use thread-locally. */
struct ws_buffer {
explicit ws_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} {}
ws_buffer() = delete;
ws_buffer(ws_buffer&&) = delete;
auto operator=(ws_buffer&&) -> ws_buffer& = delete;
ws_buffer(const ws_buffer&) = delete;
auto operator=(const ws_buffer&) -> ws_buffer& = delete;
~ws_buffer() noexcept
{
if (size_ == 0) { return; }
data_host_ = malloc(size_);
#ifndef BUILD_CPU_ONLY
cudaMallocAsync(&data_device_, size_, stream_);
cudaStreamSynchronize(stream_);
#endif
}
result_buffer() = delete;
result_buffer(result_buffer&&) = delete;
auto operator=(result_buffer&&) -> result_buffer& = delete;
result_buffer(const result_buffer&) = delete;
auto operator=(const result_buffer&) -> result_buffer& = delete;
~result_buffer() noexcept
{
if (size_ == 0) { return; }
#ifndef BUILD_CPU_ONLY
cudaFreeAsync(data_device_, stream_);
cudaStreamSynchronize(stream_);
if (data_device_ != nullptr) {
cudaFreeAsync(data_device_, stream_);
cudaStreamSynchronize(stream_);
}
if (data_host_ != nullptr) { cudaFreeHost(data_host_); }
#else
if (data_host_ != nullptr) { free(data_host_); }
#endif
free(data_host_);
}

[[nodiscard]] auto size() const noexcept { return size_; }
[[nodiscard]] auto data(MemoryType loc) const noexcept
[[nodiscard]] auto data(MemoryType loc) const noexcept -> void*
{
if (size_ == 0) { return nullptr; }
switch (loc) {
case MemoryType::kDevice: return data_device_;
default: return data_host_;
#ifndef BUILD_CPU_ONLY
case MemoryType::kDevice: {
if (data_device_ == nullptr) {
cudaMallocAsync(&data_device_, size_, stream_);
cudaStreamSynchronize(stream_);
needs_cleanup_device_ = false;
} else if (needs_cleanup_device_) {
cudaMemsetAsync(data_device_, 0, size_, stream_);
cudaStreamSynchronize(stream_);
needs_cleanup_device_ = false;
}
return data_device_;
}
#endif
default: {
if (data_host_ == nullptr) {
#ifndef BUILD_CPU_ONLY
cudaMallocHost(&data_host_, size_);
#else
data_host_ = malloc(size_);
#endif
needs_cleanup_host_ = false;
} else if (needs_cleanup_host_) {
memset(data_host_, 0, size_);
needs_cleanup_host_ = false;
}
return data_host_;
}
}
}

void transfer_data(MemoryType dst, MemoryType src)
{
// The destination is overwritten and thus does not need cleanup
if (dst == MemoryType::kDevice) {
needs_cleanup_device_ = false;
} else {
needs_cleanup_host_ = false;
}
auto dst_ptr = data(dst);
auto src_ptr = data(src);
if (dst_ptr == src_ptr) { return; }
@@ -243,15 +272,25 @@ struct result_buffer {
#endif
}

/** Mark the buffer for reuse - it needs to be cleared to make sure the previous results are not
* leaked to the new iteration. */
void reuse()
{
needs_cleanup_host_ = true;
needs_cleanup_device_ = true;
}

private:
size_t size_{0};
cudaStream_t stream_ = nullptr;
void* data_host_ = nullptr;
void* data_device_ = nullptr;
cudaStream_t stream_ = nullptr;
mutable void* data_host_ = nullptr;
mutable void* data_device_ = nullptr;
mutable bool needs_cleanup_host_ = false;
mutable bool needs_cleanup_device_ = false;
};

namespace detail {
inline std::vector<std::unique_ptr<result_buffer>> global_result_buffer_pool(0);
inline std::vector<std::unique_ptr<ws_buffer>> global_result_buffer_pool(0);
inline std::mutex grp_mutex;
} // namespace detail

@@ -262,24 +301,47 @@ inline std::mutex grp_mutex;
* This reduces the setup overhead and number of times the context is being blocked
* (this is relevant if there is a persistent kernel running across multiples benchmark cases).
*/
inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer&
inline auto get_result_buffer_from_global_pool(size_t size) -> ws_buffer&
{
auto stream = get_stream_from_global_pool();
auto& rb = [stream, size]() -> result_buffer& {
auto& rb = [stream, size]() -> ws_buffer& {
std::lock_guard guard(detail::grp_mutex);
if (static_cast<int>(detail::global_result_buffer_pool.size()) < benchmark_n_threads) {
detail::global_result_buffer_pool.resize(benchmark_n_threads);
}
auto& rb = detail::global_result_buffer_pool[benchmark_thread_id];
if (!rb || rb->size() < size) { rb = std::make_unique<result_buffer>(size, stream); }
if (!rb || rb->size() < size) {
rb = std::make_unique<ws_buffer>(size, stream);
} else {
rb->reuse();
}
return *rb;
}();
return rb;
}

memset(rb.data(MemoryType::kHost), 0, size);
#ifndef BUILD_CPU_ONLY
cudaMemsetAsync(rb.data(MemoryType::kDevice), 0, size, stream);
cudaStreamSynchronize(stream);
#endif
namespace detail {
inline std::vector<std::unique_ptr<ws_buffer>> global_tmp_buffer_pool(0);
inline std::mutex gtp_mutex;
} // namespace detail

/**
* Global temporary buffer pool for use by algorithms.
* In contrast to `get_result_buffer_from_global_pool`, the content of these buffers is never
* initialized.
*/
inline auto get_tmp_buffer_from_global_pool(size_t size) -> ws_buffer&
{
auto stream = get_stream_from_global_pool();
auto& rb = [stream, size]() -> ws_buffer& {
std::lock_guard guard(detail::gtp_mutex);
if (static_cast<int>(detail::global_tmp_buffer_pool.size()) < benchmark_n_threads) {
detail::global_tmp_buffer_pool.resize(benchmark_n_threads);
}
auto& rb = detail::global_tmp_buffer_pool[benchmark_thread_id];
if (!rb || rb->size() < size) { rb = std::make_unique<ws_buffer>(size, stream); }
return *rb;
}();
return rb;
}

@@ -293,6 +355,7 @@ inline void reset_global_device_resources()
{
#ifndef BUILD_CPU_ONLY
std::lock_guard guard(detail::gsp_mutex);
detail::global_tmp_buffer_pool.resize(0);
detail::global_result_buffer_pool.resize(0);
detail::global_stream_pool.resize(0);
#endif
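
The rewritten ws_buffer allocates its host and device storage lazily on first access and, after reuse(), clears a buffer only when it is next requested, so untouched buffers cost nothing between benchmark iterations. A simplified, host-only sketch of that lazy-allocate-and-clear idiom follows; the class name and the malloc/memset implementation here are assumptions for illustration, not the benchmark's CUDA-aware version.

// Host-only illustration of lazy allocation plus deferred clearing: memory is allocated
// on first access, and reuse() only marks the buffer dirty so the memset happens lazily.
#include <cstdlib>
#include <cstring>

class lazy_buffer {
 public:
  explicit lazy_buffer(std::size_t size) : size_{size} {}
  ~lazy_buffer() { std::free(data_); }
  lazy_buffer(const lazy_buffer&)            = delete;
  lazy_buffer& operator=(const lazy_buffer&) = delete;

  // Allocate on first use; clear only if a previous user marked the buffer dirty.
  void* data()
  {
    if (data_ == nullptr) {
      data_  = std::malloc(size_);
      dirty_ = false;
    } else if (dirty_) {
      std::memset(data_, 0, size_);
      dirty_ = false;
    }
    return data_;
  }

  // Cheap hand-off between iterations: defer the clearing until data() is called again.
  void reuse() { dirty_ = true; }

 private:
  std::size_t size_{0};
  void* data_ = nullptr;
  bool dirty_ = false;
};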
10 changes: 10 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -247,6 +247,16 @@ void parse_search_param(const nlohmann::json& conf,
if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
if (conf.contains("persistent")) { param.p.persistent = conf.at("persistent"); }
if (conf.contains("persistent_lifetime")) {
param.p.persistent_lifetime = conf.at("persistent_lifetime");
}
if (conf.contains("persistent_device_usage")) {
param.p.persistent_device_usage = conf.at("persistent_device_usage");
}
if (conf.contains("thread_block_size")) {
param.p.thread_block_size = conf.at("thread_block_size");
}
if (conf.contains("algo")) {
if (conf.at("algo") == "single_cta") {
param.p.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA;
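
The parser above now forwards four additional CAGRA search options from the benchmark's JSON config: persistent, persistent_lifetime, persistent_device_usage, and thread_block_size. The sketch below shows such a config being read with nlohmann::json in the same contains/at style; the surrounding struct, its field types, and the example values are assumptions for illustration only.

// Sketch of reading the newly parsed search keys; key names come from the diff above,
// while the struct and defaults are hypothetical.
#include <iostream>

#include <nlohmann/json.hpp>

struct cagra_search_params_sketch {
  bool persistent               = false;
  float persistent_lifetime     = 0.0f;
  float persistent_device_usage = 0.0f;
  int thread_block_size         = 0;
};

int main()
{
  auto conf = nlohmann::json::parse(R"({
    "persistent": true,
    "persistent_lifetime": 2.0,
    "persistent_device_usage": 1.0,
    "thread_block_size": 256
  })");

  cagra_search_params_sketch p;
  if (conf.contains("persistent")) { p.persistent = conf.at("persistent").get<bool>(); }
  if (conf.contains("persistent_lifetime")) {
    p.persistent_lifetime = conf.at("persistent_lifetime").get<float>();
  }
  if (conf.contains("persistent_device_usage")) {
    p.persistent_device_usage = conf.at("persistent_device_usage").get<float>();
  }
  if (conf.contains("thread_block_size")) {
    p.thread_block_size = conf.at("thread_block_size").get<int>();
  }
  std::cout << "persistent=" << p.persistent << " thread_block_size=" << p.thread_block_size
            << "\n";
  return 0;
}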
(Diffs for the remaining changed files are not shown.)
