Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat] Support bitset_to_csr #2523

Open
wants to merge 8 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 76 additions & 33 deletions cpp/bench/prims/linalg/masked_matmul.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams<value_t>&
{
os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n
<< "\tsparsity=" << params.sparsity;
if (params.sparsity == 1.0) { os << "<-inner product for comparison"; }
if (params.sparsity == 0.0) { os << "<-inner product for comparison"; }
return os;
}

template <typename value_t, typename index_t = int64_t, typename bitmap_t = uint32_t>
template <typename value_t,
bool bitmap_or_bitset = true,
typename index_t = int64_t,
typename bits_t = uint32_t>
struct MaskedMatmulBench : public fixture {
MaskedMatmulBench(const MaskedMatmulBenchParams<value_t>& p)
: fixture(true),
Expand All @@ -64,15 +67,15 @@ struct MaskedMatmulBench : public fixture {
c_indptr_d(0, stream),
c_indices_d(0, stream),
c_data_d(0, stream),
bitmap_d(0, stream),
bits_d(0, stream),
c_dense_data_d(0, stream)
{
index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8));
std::vector<bitmap_t> bitmap_h(element);
index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bits_t) * 8));
std::vector<bits_t> bits_h(element);

a_data_d.resize(params.m * params.k, stream);
b_data_d.resize(params.k * params.n, stream);
bitmap_d.resize(element, stream);
bits_d.resize(element, stream);

raft::random::RngState rng(2024ULL);
raft::random::uniform(
Expand All @@ -82,7 +85,13 @@ struct MaskedMatmulBench : public fixture {

std::vector<bool> c_dense_data_h(params.m * params.n);

c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h);
if constexpr (bitmap_or_bitset) {
c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h);
} else {
c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h);
repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1);
c_true_nnz *= params.m;
}

std::vector<value_t> values(c_true_nnz);
std::vector<index_t> indices(c_true_nnz);
Expand All @@ -93,24 +102,49 @@ struct MaskedMatmulBench : public fixture {
c_indices_d.resize(c_true_nnz, stream);
c_dense_data_d.resize(params.m * params.n, stream);

cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr);
cpu_convert_to_csr(bits_h, params.m, params.n, indices, indptr);
RAFT_EXPECTS(c_true_nnz == c_indices_d.size(),
"Something wrong. The c_true_nnz != c_indices_d.size()!");

update_device(c_data_d.data(), values.data(), c_true_nnz, stream);
update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream);
update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream);
update_device(bitmap_d.data(), bitmap_h.data(), element, stream);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want to support both bitmap and bitset inputs but it appears we're removing the bitmap support.

Copy link
Member Author

@rhdong rhdong Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"bits" is only a name; the code needs to be compatible with both bitset and bitmap, so I renamed "bitmap" to "bits", because this is the point where the two are told apart.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not easy to understand, and it's not obvious which one is selected when the flag is true or false (future readers are going to be confused too). Let's create an enum for this that we can share across benchmarks and tests. It'll make this more straightforward for future readers too.

update_device(bits_d.data(), bits_h.data(), element, stream);
}

// Appends `repeat` copies of the first `input_bits` bits of `inout` directly
// after those bits, in place. Destination bits are OR-ed in (never cleared),
// and `inout` is indexed without bounds checks, so the caller must provide a
// vector that already holds at least input_bits * (repeat + 1) bits.
void repeat_cpu_bitset_inplace(std::vector<bits_t>& inout, size_t input_bits, size_t repeat)
{
  constexpr size_t kBitsPerWord = sizeof(bits_t) * 8;

  // Next bit position to write; the first copy starts right after the source bits.
  size_t dst_bit = input_bits;

  for (size_t copy = 0; copy < repeat; ++copy) {
    for (size_t src_bit = 0; src_bit < input_bits; ++src_bit, ++dst_bit) {
      const bool is_set = (inout[src_bit / kBitsPerWord] >> (src_bit % kBitsPerWord)) & 1;

      inout[dst_bit / kBitsPerWord] |= (static_cast<bits_t>(is_set) << (dst_bit % kBitsPerWord));
    }
  }
}

index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bits_t>& bits)
{
index_t total = static_cast<index_t>(m * n);
index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
index_t res = num_ones;

for (auto& item : bitmap) {
item = static_cast<bitmap_t>(0);
if (sparsity == 0.0f) {
std::fill(bits.begin(), bits.end(), 0xffffffff);
return num_ones;
}

for (auto& item : bits) {
item = static_cast<bits_t>(0);
}

std::random_device rd;
Expand All @@ -120,8 +154,8 @@ struct MaskedMatmulBench : public fixture {
while (num_ones > 0) {
index_t index = dis(gen);

bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))];
index_t bit_position = index % (8 * sizeof(bitmap_t));
bits_t& element = bits[index / (8 * sizeof(bits_t))];
index_t bit_position = index % (8 * sizeof(bits_t));

if (((element >> bit_position) & 1) == 0) {
element |= (static_cast<index_t>(1) << bit_position);
Expand All @@ -131,7 +165,7 @@ struct MaskedMatmulBench : public fixture {
return res;
}

void cpu_convert_to_csr(std::vector<bitmap_t>& bitmap,
void cpu_convert_to_csr(std::vector<bits_t>& bits,
index_t rows,
index_t cols,
std::vector<index_t>& indices,
Expand All @@ -142,14 +176,14 @@ struct MaskedMatmulBench : public fixture {
indptr[offset_indptr++] = 0;

index_t index = 0;
bitmap_t element = 0;
bits_t element = 0;
index_t bit_position = 0;

for (index_t i = 0; i < rows; ++i) {
for (index_t j = 0; j < cols; ++j) {
index = i * cols + j;
element = bitmap[index / (8 * sizeof(bitmap_t))];
bit_position = index % (8 * sizeof(bitmap_t));
element = bits[index / (8 * sizeof(bits_t))];
bit_position = index % (8 * sizeof(bits_t));

if (((element >> bit_position) & 1)) {
indices[offset_values] = static_cast<index_t>(j);
Expand Down Expand Up @@ -181,13 +215,17 @@ struct MaskedMatmulBench : public fixture {
params.n,
static_cast<index_t>(c_indices_d.size()));

auto mask =
raft::core::bitmap_view<const bitmap_t, index_t>(bitmap_d.data(), params.m, params.n);

auto c = raft::make_device_csr_matrix_view<value_t>(c_data_d.data(), c_structure);

if (params.sparsity < 1.0) {
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
if (params.sparsity > 0.0) {
if constexpr (bitmap_or_bitset) {
auto mask =
raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
} else {
auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
}
} else {
raft::distance::pairwise_distance(handle,
a_data_d.data(),
Expand All @@ -201,12 +239,16 @@ struct MaskedMatmulBench : public fixture {
}
resource::sync_stream(handle);

raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
resource::sync_stream(handle);

loop_on_state(state, [this, &a, &b, &mask, &c]() {
if (params.sparsity < 1.0) {
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
loop_on_state(state, [this, &a, &b, &c]() {
if (params.sparsity > 0.0) {
if constexpr (bitmap_or_bitset) {
auto mask =
raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
} else {
auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
}
} else {
raft::distance::pairwise_distance(handle,
a_data_d.data(),
Expand All @@ -228,7 +270,7 @@ struct MaskedMatmulBench : public fixture {

rmm::device_uvector<value_t> a_data_d;
rmm::device_uvector<value_t> b_data_d;
rmm::device_uvector<bitmap_t> bitmap_d;
rmm::device_uvector<bits_t> bits_d;

rmm::device_uvector<value_t> c_dense_data_d;

Expand All @@ -253,7 +295,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
raft::util::itertools::product<TestParams>({size_t(10), size_t(1024)},
{size_t(128), size_t(1024)},
{size_t(1024 * 1024)},
{0.01f, 0.1f, 0.2f, 0.5f, 1.0f});
{0.99f, 0.9f, 0.8f, 0.5f, 0.0f});

param_vec.reserve(params_group.size());
for (TestParams params : params_group) {
Expand All @@ -263,6 +305,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
return param_vec;
}

RAFT_BENCH_REGISTER((MaskedMatmulBench<float>), "", getInputs<float>());
RAFT_BENCH_REGISTER((MaskedMatmulBench<float, true>), "", getInputs<float>());
RAFT_BENCH_REGISTER((MaskedMatmulBench<float, false>), "", getInputs<float>());

} // namespace raft::bench::linalg
178 changes: 178 additions & 0 deletions cpp/bench/prims/sparse/bitset_to_csr.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <common/benchmark.hpp>

#include <raft/core/device_resources.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <raft/util/itertools.hpp>

#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <random>
#include <sstream>
#include <vector>

namespace raft::bench::sparse {

// Parameters describing one bitset_to_csr benchmark case.
template <typename index_t>
struct bench_param {
  // Row count of the CSR matrix produced by the benchmark (printed as "rows").
  index_t n_repeat;
  // Number of bits in the input bitset, and the CSR column count.
  index_t n_cols;
  // Fraction of bits left unset: the generator sets about (1 - sparsity) * n_cols bits.
  float sparsity;
};

// Writes the benchmark label for `params` (matrix shape followed by sparsity)
// to `os` and returns the same stream so calls can be chained.
template <typename index_t>
inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
{
  os << " rows*cols=" << params.n_repeat << "*" << params.n_cols;
  os << "\tsparsity=" << params.sparsity;
  return os;
}

/**
 * @brief Benchmark fixture for raft::sparse::convert::bitset_to_csr.
 *
 * A random bitset of `n_cols` bits is generated on the host and uploaded once in
 * the constructor. `run_benchmark` then converts it into a CSR matrix declared
 * with `n_repeat` rows and `n_cols` columns.
 *
 * @tparam bitset_t word type used to store the bitset
 * @tparam index_t  index type of the bitset view and of the CSR structure
 * @tparam value_t  value type of the CSR matrix
 */
template <typename bitset_t, typename index_t, typename value_t = float>
struct BitsetToCsrBench : public fixture {
  // The initializer list follows the member declaration order, which is the
  // order in which the members are actually constructed.
  BitsetToCsrBench(const bench_param<index_t>& p)
    : fixture(true),
      handle(stream),
      params(p),
      bitset_d(0, stream),
      indptr_d(0, stream),
      indices_d(0, stream),
      values_d(0, stream),
      nnz(0)
  {
    // Number of bitset_t words needed to hold n_cols bits.
    index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8));
    std::vector<bitset_t> bitset_h(element);
    nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h);

    // NOTE(review): indices_d / values_d are sized with the nnz of a single
    // bitset row, while the CSR view in run_benchmark declares n_repeat rows.
    // Confirm whether bitset_to_csr expects room for nnz * n_repeat entries.
    bitset_d.resize(bitset_h.size(), stream);
    indptr_d.resize(params.n_repeat + 1, stream);
    indices_d.resize(nnz, stream);
    values_d.resize(nnz, stream);

    update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);

    resource::sync_stream(handle);
  }

  /**
   * @brief Clears `bitset` and sets randomly chosen, distinct bits in it.
   *
   * @param m        number of rows of the logical bit matrix
   * @param n        number of columns of the logical bit matrix
   * @param sparsity fraction of the m * n bits that stay unset
   * @param bitset   output words; must hold at least m * n bits
   * @return the number of bits requested, (m * n) * (1 - sparsity) truncated to index_t
   */
  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitset_t>& bitset)
  {
    constexpr index_t bits_per_word = static_cast<index_t>(8 * sizeof(bitset_t));

    index_t total    = static_cast<index_t>(m * n);
    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
    index_t res      = num_ones;

    for (auto& item : bitset) {
      item = static_cast<bitset_t>(0);
    }

    // Nothing to set. Returning here also avoids building a distribution over
    // the invalid range [0, -1] when the matrix is empty.
    if (num_ones <= 0) { return res; }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<index_t> dis(0, total - 1);

    // Rejection sampling: keep drawing positions until num_ones distinct bits are set.
    while (num_ones > 0) {
      index_t index = dis(gen);

      bitset_t& element    = bitset[index / bits_per_word];
      index_t bit_position = index % bits_per_word;

      if (((element >> bit_position) & 1) == 0) {
        // The mask must be built in bitset_t: shifting an index_t-typed 1 is
        // wrong whenever bitset_t is wider than index_t (e.g. uint64_t words
        // with int indices), as bit positions >= 31 would overflow.
        element |= (static_cast<bitset_t>(1) << bit_position);
        num_ones--;
      }
    }
    return res;
  }

  // Labels the run with the benchmark parameters, performs one bitset_to_csr
  // call outside the measured loop, then measures repeated calls.
  void run_benchmark(::benchmark::State& state) override
  {
    std::ostringstream label_stream;
    label_stream << params;
    state.SetLabel(label_stream.str());

    auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), 1 * params.n_cols);

    auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
      indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
    auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);

    // Un-timed call before measuring (presumably a warm-up).
    raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);

    resource::sync_stream(handle);
    loop_on_state(state, [this, &bitset, &csr]() {
      raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);
    });
  }

 protected:
  const raft::device_resources handle;

  bench_param<index_t> params;

  rmm::device_uvector<bitset_t> bitset_d;  // device copy of the host-generated bitset
  rmm::device_uvector<index_t> indptr_d;   // CSR row offsets, n_repeat + 1 entries
  rmm::device_uvector<index_t> indices_d;  // CSR column indices
  rmm::device_uvector<value_t> values_d;   // sized like indices_d

  index_t nnz;  // number of set bits in the generated bitset
};              // struct BitsetToCsrBench

// Builds the standard benchmark cases: every combination of row count
// {10, 1024}, a 1024 * 1024 bit bitset and sparsity {0.99, 0.9, 0.8, 0.5}.
template <typename index_t>
const std::vector<bench_param<index_t>> getInputs()
{
  struct TestParams {
    index_t m;
    index_t n;
    float sparsity;
  };

  const std::vector<TestParams> grid = raft::util::itertools::product<TestParams>(
    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});

  std::vector<bench_param<index_t>> inputs;
  inputs.reserve(grid.size());
  for (const auto& combo : grid) {
    inputs.push_back(bench_param<index_t>{combo.m, combo.n, combo.sparsity});
  }
  return inputs;
}

// Builds the large benchmark cases: every combination of row count {1, 100},
// a bitset of 100 * 1000000 bits and sparsity {0.95, 0.99}.
template <typename index_t = int64_t>
const std::vector<bench_param<index_t>> getLargeInputs()
{
  struct TestParams {
    index_t m;
    index_t n;
    float sparsity;
  };

  const std::vector<TestParams> combinations = raft::util::itertools::product<TestParams>(
    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});

  std::vector<bench_param<index_t>> result;
  result.reserve(combinations.size());
  for (const TestParams& tp : combinations) {
    bench_param<index_t> item{tp.m, tp.n, tp.sparsity};
    result.push_back(item);
  }
  return result;
}

// Standard cases (getInputs) with int indices, for 32-bit and 64-bit bitset words.
RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int, float>), "", getInputs<int>());
RAFT_BENCH_REGISTER((BitsetToCsrBench<uint64_t, int, double>), "", getInputs<int>());

// Large cases (getLargeInputs) with int64_t indices.
RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());

} // namespace raft::bench::sparse
Loading
Loading