Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/branch-23.10' into enh-ann-bench…
Browse files Browse the repository at this point in the history
…-faiss-cpu
  • Loading branch information
cjnolet committed Sep 25, 2023
2 parents 1ec75ba + 8522a14 commit a5585fa
Show file tree
Hide file tree
Showing 121 changed files with 5,296 additions and 1,033 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
arch: "amd64"
branch: ${{ inputs.branch }}
build_type: ${{ inputs.build_type || 'branch' }}
container_image: "rapidsai/ci:latest"
container_image: "rapidsai/ci-conda:latest"
date: ${{ inputs.date }}
node_type: "gpu-v100-latest-1"
run_script: "ci/build_docs.sh"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
build_type: pull-request
node_type: "gpu-v100-latest-1"
arch: "amd64"
container_image: "rapidsai/ci:latest"
container_image: "rapidsai/ci-conda:latest"
run_script: "ci/build_docs.sh"
wheel-build-pylibraft:
needs: checks
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ repos:
additional_dependencies: [toml]
args: ["--config=pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.1
rev: v16.0.6
hooks:
- id: clang-format
types_or: [c, c++, cuda]
Expand Down
5 changes: 1 addition & 4 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cma
sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pylibraft/__init__.py
sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/raft_dask/__init__.py

# Python pyproject.toml updates
sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pyproject.toml
sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/pyproject.toml

# Wheel testing script
sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh

Expand All @@ -74,6 +70,7 @@ for FILE in python/*/pyproject.toml; do
for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
done
sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" "${FILE}"
sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE}
done

Expand Down
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ channels:
dependencies:
- breathe
- c-compiler
- clang-tools=16.0.1
- clang=16.0.1
- clang-tools=16.0.6
- clang=16.0.6
- cmake>=3.26.4
- cuda-profiler-api=11.8.86
- cuda-python>=11.7.1,<12.0a0
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ channels:
dependencies:
- breathe
- c-compiler
- clang-tools=16.0.1
- clang=16.0.1
- clang-tools=16.0.6
- clang=16.0.6
- cmake>=3.26.4
- cuda-cudart-dev
- cuda-profiler-api
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ channels:
dependencies:
- benchmark>=1.8.2
- c-compiler
- clang-tools=16.0.1
- clang=16.0.1
- clang-tools=16.0.6
- clang=16.0.6
- cmake>=3.26.4
- cuda-profiler-api=11.8.86
- cuda-version=11.8
Expand Down
6 changes: 4 additions & 2 deletions conda/recipes/raft-ann-bench/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,11 @@ requirements:
- h5py {{ h5py_version }}
- benchmark
- matplotlib
# rmm is needed to determine if package is gpu-enabled
- rmm ={{ minor_version }}
- python
- pandas
- pyyaml
# rmm is needed to determine if package is gpu-enabled
- rmm ={{ minor_version }}

run:
- python
Expand All @@ -106,6 +106,8 @@ requirements:
- python
- pandas
- pyyaml
# rmm is needed to determine if package is gpu-enabled
- rmm ={{ minor_version }}
about:
home: https://rapids.ai/
license: Apache-2.0
Expand Down
96 changes: 79 additions & 17 deletions cpp/bench/prims/matrix/select_k.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <common/benchmark.hpp>

#include <raft/core/device_resources.hpp>
#include <raft/core/nvtx.hpp>
#include <raft/random/rng.cuh>
#include <raft/sparse/detail/utils.h>
#include <raft/util/cudart_utils.hpp>
Expand All @@ -38,6 +39,19 @@
namespace raft::matrix {
using namespace raft::bench; // NOLINT

template <typename KeyT>
struct replace_with_mask {
KeyT replacement;
int64_t line_length;
int64_t spared_inputs;
constexpr auto inline operator()(int64_t offset, KeyT x, uint8_t mask) -> KeyT
{
auto i = offset % line_length;
// don't replace all the inputs, spare a few elements at the beginning of the input
return (mask && i >= spared_inputs) ? replacement : x;
}
};

template <typename KeyT, typename IdxT, select::Algo Algo>
struct selection : public fixture {
explicit selection(const select::params& p)
Expand Down Expand Up @@ -67,6 +81,21 @@ struct selection : public fixture {
}
}
raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value);
if (p.frac_infinities > 0.0) {
rmm::device_uvector<uint8_t> mask_buf(p.batch_size * p.len, stream);
auto mask = make_device_vector_view<uint8_t, size_t>(mask_buf.data(), mask_buf.size());
raft::random::bernoulli(handle, state, mask, p.frac_infinities);
KeyT bound = p.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
auto mask_in =
make_device_vector_view<const uint8_t, size_t>(mask_buf.data(), mask_buf.size());
auto dists_in = make_device_vector_view<const KeyT>(in_dists_.data(), in_dists_.size());
auto dists_out = make_device_vector_view<KeyT>(in_dists_.data(), in_dists_.size());
raft::linalg::map_offset(handle,
dists_out,
replace_with_mask<KeyT>{bound, int64_t(p.len), int64_t(p.k / 2)},
dists_in,
mask_in);
}
}

void run_benchmark(::benchmark::State& state) override // NOLINT
Expand All @@ -75,8 +104,12 @@ struct selection : public fixture {
std::ostringstream label_stream;
label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k;
if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; }
if (params_.frac_infinities > 0) { label_stream << "#infs-" << params_.frac_infinities; }
state.SetLabel(label_stream.str());
loop_on_state(state, [this]() {
common::nvtx::range case_scope("%s - %s", state.name().c_str(), label_stream.str().c_str());
int iter = 0;
loop_on_state(state, [&iter, this]() {
common::nvtx::range lap_scope("lap-", iter++);
select::select_k_impl<KeyT, IdxT>(handle,
Algo,
in_dists_.data(),
Expand Down Expand Up @@ -149,6 +182,35 @@ const std::vector<select::params> kInputs{
{10, 1000000, 64, true, false, true},
{10, 1000000, 128, true, false, true},
{10, 1000000, 256, true, false, true},

{10, 1000000, 1, true, false, false, true, 0.1},
{10, 1000000, 16, true, false, false, true, 0.1},
{10, 1000000, 64, true, false, false, true, 0.1},
{10, 1000000, 128, true, false, false, true, 0.1},
{10, 1000000, 256, true, false, false, true, 0.1},

{10, 1000000, 1, true, false, false, true, 0.9},
{10, 1000000, 16, true, false, false, true, 0.9},
{10, 1000000, 64, true, false, false, true, 0.9},
{10, 1000000, 128, true, false, false, true, 0.9},
{10, 1000000, 256, true, false, false, true, 0.9},
{1000, 10000, 1, true, false, false, true, 0.9},
{1000, 10000, 16, true, false, false, true, 0.9},
{1000, 10000, 64, true, false, false, true, 0.9},
{1000, 10000, 128, true, false, false, true, 0.9},
{1000, 10000, 256, true, false, false, true, 0.9},

{10, 1000000, 1, true, false, false, true, 1.0},
{10, 1000000, 16, true, false, false, true, 1.0},
{10, 1000000, 64, true, false, false, true, 1.0},
{10, 1000000, 128, true, false, false, true, 1.0},
{10, 1000000, 256, true, false, false, true, 1.0},
{1000, 10000, 1, true, false, false, true, 1.0},
{1000, 10000, 16, true, false, false, true, 1.0},
{1000, 10000, 64, true, false, false, true, 1.0},
{1000, 10000, 128, true, false, false, true, 1.0},
{1000, 10000, 256, true, false, false, true, 1.0},
{1000, 10000, 256, true, false, false, true, 0.999},
};

#define SELECTION_REGISTER(KeyT, IdxT, A) \
Expand All @@ -157,28 +219,28 @@ const std::vector<select::params> kInputs{
RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \
}

SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT
SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT
SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT

SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT

SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT
SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT
SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT

// For learning a heuristic of which selection algorithm to use, we
// have a couple of additional constraints when generating the dataset:
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY))
adjust_centers_kernel(MathT* centers, // [n_clusters, dim]
IdxT n_clusters,
IdxT dim,
const T* dataset, // [n_rows, dim]
const T* dataset, // [n_rows, dim]
IdxT n_rows,
const LabelT* labels, // [n_rows]
const CounterT* cluster_sizes, // [n_clusters]
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ namespace numpy_serializer {

#if RAFT_SYSTEM_LITTLE_ENDIAN == 1
#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR
#else // RAFT_SYSTEM_LITTLE_ENDIAN == 1
#else // RAFT_SYSTEM_LITTLE_ENDIAN == 1
#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR
#endif // RAFT_SYSTEM_LITTLE_ENDIAN == 1

Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/nvtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ inline void pop_range()

} // namespace raft::common::nvtx::detail

#else // NVTX_ENABLED
#else // NVTX_ENABLED

namespace raft::common::nvtx::detail {

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/core/kvp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ struct KeyValuePair {
typedef _Key Key; ///< Key data type
typedef _Value Value; ///< Value data type

Key key; ///< Item key
Value value; ///< Item value
Key key; ///< Item key
Value value; ///< Item value

/// Constructor
RAFT_INLINE_FUNCTION KeyValuePair() {}
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/core/resource/resource_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ enum resource_type {
THRUST_POLICY, // thrust execution policy
WORKSPACE_RESOURCE, // rmm device memory resource

LAST_KEY // reserved for the last key
LAST_KEY // reserved for the last key
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
TensorTileIterator
tensor_iterator, ///< Threadblock tile iterator for additional tensor operand
MatrixCoord const&
problem_size = ///< Problem size needed to guard against out-of-bounds accesses
problem_size = ///< Problem size needed to guard against out-of-bounds accesses
MatrixCoord(Shape::kM, Shape::kN),
MatrixCoord const&
threadblock_offset = ///< Threadblock's initial offset within the problem size space
Expand All @@ -418,7 +418,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
ElementVector const* broadcast_ptr, ///< Broadcast vector
MatrixCoord const&
problem_size, ///< Problem size needed to guard against out-of-bounds accesses
problem_size, ///< Problem size needed to guard against out-of-bounds accesses
MatrixCoord const&
threadblock_offset ///< Threadblock's initial offset within the problem size space
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ namespace threadblock {
///
/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
///
template <typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
typename Element_, ///< Element data type
template <typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
typename Element_, ///< Element data type
typename Layout_,
bool ScatterD = false, ///< Scatter D operand or not
bool UseCUDAStore = false>
Expand Down
10 changes: 5 additions & 5 deletions cpp/include/raft/distance/detail/fused_l2_nn.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@

#pragma once

#include <cstddef> // size_t
#include <limits> // std::numeric_limits
#include <raft/core/kvp.hpp> // raft::KeyValuePair
#include <raft/core/operators.hpp> // raft::identity_op
#include <raft/distance/detail/distance_ops/l2_exp.cuh> // ops::l2_exp_distance_op
#include <cstddef> // size_t
#include <limits> // std::numeric_limits
#include <raft/core/kvp.hpp> // raft::KeyValuePair
#include <raft/core/operators.hpp> // raft::identity_op
#include <raft/distance/detail/distance_ops/l2_exp.cuh> // ops::l2_exp_distance_op
#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh> // PairwiseDistances
#include <raft/linalg/contractions.cuh> // Policy
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/masked_distance_base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ struct MaskedDistances : public BaseClass {
} // tile_idx_n
} // idx_g
rowEpilog_op(tile_idx_m);
} // tile_idx_m
} // tile_idx_m
}

private:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <raft/util/cuda_dev_essentials.cuh> // ceildiv
#include <raft/util/cuda_rt_essentials.hpp> // RAFT_CUDA_TRY

#include <cstddef> // size_t
#include <cstddef> // size_t

namespace raft {
namespace distance {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void pairwise_matrix_dispatch(OpT distance_op,
cudaStream_t stream,
bool is_row_major) RAFT_EXPLICIT;

}; // namespace raft::distance::detail
}; // namespace raft::distance::detail

#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ namespace threadblock {
///
/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
///
template <typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
typename Element_, ///< Element data type
template <typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
typename Element_, ///< Element data type
typename Layout_,
bool ScatterD = false, ///< Scatter D operand or not
bool UseCUDAStore = false>
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/distance-ext.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ void pairwise_distance(raft::resources const& handle,
raft::distance::DistanceType metric,
Type metric_arg = 2.0f) RAFT_EXPLICIT;

}; // namespace distance
}; // namespace raft
}; // namespace distance
}; // namespace raft

#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/add.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ void add_scalar(raft::resources const& handle,

/** @} */ // end of group add

}; // end namespace linalg
}; // end namespace raft
}; // end namespace linalg
}; // end namespace raft

#endif
Loading

0 comments on commit a5585fa

Please sign in to comment.