-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: new function zip_sp_matmul_topn that can zip matrices zip_j A.do…
…t(B_j). The function returns a zipped matrix Z in CSR format, zip_j C_j, where Z = [sorted top-n results > lower_bound for each row of C_j], C_j = A.dot(B_j), and B has been split row-wise into sub-matrices B_j. Only the sorted variant of the sp_matzip function is supported; the unsorted variant (sorted by insertion order) cannot be made equal to the unsorted function on the full matrices. zip_sp_matmul_topn sorts by value by default. Also added a Python function to zip split matrices, plus two unit tests for the functionality. NB: the unit test test_stack_zip_sp_matmul_topn is skipped for Python 3.8 due to a bug in the scipy vstack function, which does not support all data types.
- Loading branch information
Showing
12 changed files
with
518 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
117 changes: 117 additions & 0 deletions
117
src/sparse_dot_topn_core/include/sparse_dot_topn/zip_sp_matmul_topn.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/* Copyright (c) 2023 ING Analytics Wholesale Banking | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <limits> | ||
#include <vector> | ||
|
||
#include <sparse_dot_topn/common.hpp> | ||
#include <sparse_dot_topn/maxheap.hpp> | ||
|
||
namespace sdtn::core { | ||
|
||
/** | ||
* \brief Zip and compute Z = zip_j C_j = zip_j A.dot(B_j) keeping only the | ||
* top-n of the zipped results. | ||
* | ||
* \details This function will return a zipped matrix Z in CSR format, zip_j | ||
* C_j, where Z = [sorted top n results > lower_bound for each row of C_j], | ||
* where C_j = A.dot(B_j) and where B has been split row-wise into sub-matrices | ||
* B_j. Note that `C_j` must be `CSR` format where the nonzero elements of the | ||
* `i`th row are located in ``data[indptr[i]:indptr[i+1]]``. The column indices | ||
* for row `i` are stored in | ||
* ``indices[indptr[i]:indptr[i+1]]``. | ||
* | ||
* \tparam eT element type of the matrices | ||
* \tparam idxT integer type of the index arrays, must be at least 32 bit int | ||
* \param[in] top_n the top n values to store | ||
* \param[in] nrowsA the number of rows in A | ||
* \param[in] ncolsB_vec the number of columns in each B_i sub-matrix | ||
* \param[in] C_data_vec vector of the nonzero elements of each C_data_j | ||
* sub-matrix | ||
* \param[in] C_indptr_vec vector of arrays containing the row indices for | ||
* `C_data_j` sub-matrices | ||
* \param[in] C_indices_vec vector of arrays containing the column indices | ||
for the C_j sub-matrices | ||
* \param[out] Z_data the nonzero elements of zipped Z matrix | ||
* \param[out] Z_indptr array containing the row indices for zipped `Z_data` | ||
* \param[out] Z_indices array containing the zipped column indices | ||
*/ | ||
template <typename eT, typename idxT, iffInt<idxT> = true> | ||
inline void zip_sp_matmul_topn( | ||
const idxT top_n, | ||
const idxT nrows, | ||
const idxT* B_ncols, | ||
const std::vector<const eT*>& C_data, | ||
const std::vector<const idxT*>& C_indptrs, | ||
const std::vector<const idxT*>& C_indices, | ||
eT* __restrict Z_data, | ||
idxT* __restrict Z_indptr, | ||
idxT* __restrict Z_indices | ||
) { | ||
idxT nnz = 0; | ||
Z_indptr[0] = 0; | ||
eT* Z_data_head = Z_data; | ||
idxT* Z_indices_head = Z_indices; | ||
const int n_mat = C_data.size(); | ||
|
||
// threshold is already consistent between matrices, so accept every line. | ||
auto max_heap = MaxHeap<eT, idxT>(top_n, std::numeric_limits<eT>::min()); | ||
|
||
// offset the index when concatenating the C sub-matrices (split by row) | ||
std::vector<idxT> offset(n_mat, idxT(0)); | ||
for (int i = 0; i < n_mat - 1; ++i) { | ||
for (int j = i; j < n_mat - 1; ++j) { | ||
offset[j + 1] += B_ncols[i]; | ||
} | ||
} | ||
|
||
// concatenate the results of each row, apply top_n and add those results to | ||
// the C matrix | ||
for (idxT i = 0; i < nrows; ++i) { | ||
eT min = max_heap.reset(); | ||
|
||
// keep topn of stacked lines for each row insert in reverse order, | ||
// similar to the reverse linked list in sp_matmul_topn | ||
for (int j = n_mat - 1; j >= 0; --j) { | ||
const idxT* C_indptr_j = C_indptrs[j]; | ||
const idxT* C_indices_j = C_indices[j]; | ||
for (idxT k = C_indptr_j[i]; k < C_indptr_j[i + 1]; ++k) { | ||
eT val = (C_data[j])[k]; | ||
if (val > min) { | ||
min = max_heap.push_pop(offset[j] + C_indices_j[k], val); | ||
} | ||
} | ||
} | ||
|
||
// sort the heap s.t. the first value is the largest | ||
max_heap.value_sort(); | ||
|
||
// fill the zipped sparse matrix Z | ||
int n_set = max_heap.get_n_set(); | ||
for (int ii = 0; ii < n_set; ++ii) { | ||
*Z_indices_head = max_heap.heap[ii].idx; | ||
*Z_data_head = max_heap.heap[ii].val; | ||
Z_indices_head++; | ||
Z_data_head++; | ||
} | ||
nnz += n_set; | ||
Z_indptr[i + 1] = nnz; | ||
} | ||
} | ||
|
||
} // namespace sdtn::core |
88 changes: 88 additions & 0 deletions
88
src/sparse_dot_topn_core/include/sparse_dot_topn/zip_sp_matmul_topn_bindings.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* Copyright (c) 2023 ING Analytics Wholesale Banking | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <nanobind/nanobind.h> | ||
#include <nanobind/ndarray.h> | ||
#include <nanobind/stl/vector.h> | ||
|
||
#include <memory> | ||
#include <numeric> | ||
#include <vector> | ||
|
||
#include <sparse_dot_topn/common.hpp> | ||
#include <sparse_dot_topn/maxheap.hpp> | ||
#include <sparse_dot_topn/zip_sp_matmul_topn.hpp> | ||
|
||
namespace sdtn { | ||
namespace nb = nanobind; | ||
|
||
namespace api { | ||
|
||
template <typename eT, typename idxT, core::iffInt<idxT> = true> | ||
inline nb::tuple zip_sp_matmul_topn( | ||
const int top_n, | ||
const idxT Z_max_nnz, | ||
const idxT nrows, | ||
const nb_vec<idxT>& B_ncols, | ||
const std::vector<nb_vec<eT>>& data, | ||
const std::vector<nb_vec<idxT>>& indptr, | ||
const std::vector<nb_vec<idxT>>& indices | ||
) { | ||
const int n_mats = B_ncols.size(); | ||
std::vector<const eT*> data_ptrs; | ||
data_ptrs.reserve(n_mats); | ||
std::vector<const idxT*> indptr_ptrs; | ||
indptr_ptrs.reserve(n_mats); | ||
std::vector<const idxT*> indices_ptrs; | ||
indices_ptrs.reserve(n_mats); | ||
|
||
for (int i = 0; i < n_mats; ++i) { | ||
data_ptrs.push_back(data[i].data()); | ||
indptr_ptrs.push_back(indptr[i].data()); | ||
indices_ptrs.push_back(indices[i].data()); | ||
} | ||
|
||
auto Z_indptr = std::unique_ptr<idxT[]>(new idxT[nrows + 1]); | ||
auto Z_indices = std::unique_ptr<idxT>(new idxT[Z_max_nnz]); | ||
auto Z_data = std::unique_ptr<eT>(new eT[Z_max_nnz]); | ||
|
||
core::zip_sp_matmul_topn<eT, idxT>( | ||
top_n, | ||
nrows, | ||
B_ncols.data(), | ||
data_ptrs, | ||
indptr_ptrs, | ||
indices_ptrs, | ||
Z_data.get(), | ||
Z_indptr.get(), | ||
Z_indices.get() | ||
); | ||
|
||
return nb::make_tuple( | ||
to_nbvec<eT>(Z_data.release(), Z_max_nnz), | ||
to_nbvec<idxT>(Z_indices.release(), Z_max_nnz), | ||
to_nbvec<idxT>(Z_indptr.release(), nrows + 1) | ||
); | ||
} | ||
} // namespace api | ||
|
||
namespace bindings { | ||
// Declaration only; no definition is visible in this header. | ||
// NOTE(review): presumably registers the zip_sp_matmul_topn bindings on | ||
// module `m` -- confirm against the definition. | ||
void bind_zip_sp_matmul_topn(nb::module_& m); | ||
} | ||
|
||
} // namespace sdtn |
Oops, something went wrong.