-
Notifications
You must be signed in to change notification settings - Fork 39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix issue #991 : Parallelize apply_Sparse_Matrix in lightning.qubit #992
base: master
Are you sure you want to change the base?
Changes from 1 commit
42b8207
e8af71e
8e27978
cead5eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,10 +20,43 @@ | |
#pragma once | ||
#include <complex> | ||
#include <vector> | ||
#include <thread> | ||
|
||
namespace Pennylane::LightningQubit::Util { | ||
|
||
/** | ||
* @brief Worker function to compute a segment of the matrix-vector multiplication for a sparse matrix. | ||
* | ||
* @tparam fp_precision data float point precision. | ||
* @tparam index_type integer type used as indices of the sparse matrix. | ||
* @param vector_ptr pointer to the vector. | ||
* @param row_map_ptr Pointer to the row_map array. Elements of this array | ||
* return the number of non-zero terms in all rows before it. | ||
* @param entries_ptr pointer to the column indices of the non-zero elements. | ||
* @param values_ptr non-zero elements. | ||
* @param result Reference to the output vector where results are stored. | ||
* @param start Index of the first row to process. | ||
* @param end Index of the last row (exclusive) to process. | ||
*/ | ||
template <class fp_precision, class index_type> | ||
void sparse_worker(const std::complex<fp_precision> *vector_ptr, | ||
const index_type *row_map_ptr, | ||
const index_type *entries_ptr, | ||
const std::complex<fp_precision> *values_ptr, | ||
std::vector<std::complex<fp_precision>> &result, | ||
index_type start, index_type end) { | ||
for (index_type i = start; i < end; i++) { | ||
std::complex<fp_precision> temp = 0.0; | ||
// Loop through all non-zero elements in row `i` | ||
for (index_type j = row_map_ptr[i]; j < row_map_ptr[i + 1]; j++) { | ||
temp += values_ptr[j] * vector_ptr[entries_ptr[j]]; | ||
} | ||
result[i] = temp; // Store the computed value in the result vector | ||
} | ||
} | ||
|
||
/** | ||
* @brief Apply a sparse matrix to a vector. | ||
* @brief Applies a sparse matrix to a vector using multi-threading. | ||
* | ||
* @tparam fp_precision data float point precision. | ||
* @tparam index_type integer type used as indices of the sparse matrix. | ||
|
@@ -40,21 +73,47 @@ namespace Pennylane::LightningQubit::Util { | |
template <class fp_precision, class index_type> | ||
std::vector<std::complex<fp_precision>> | ||
apply_Sparse_Matrix(const std::complex<fp_precision> *vector_ptr, | ||
const index_type vector_size, const index_type *row_map_ptr, | ||
[[maybe_unused]] const index_type row_map_size, | ||
const index_type *entries_ptr, | ||
const std::complex<fp_precision> *values_ptr, | ||
[[maybe_unused]] const index_type numNNZ) { | ||
std::vector<std::complex<fp_precision>> result; | ||
result.resize(vector_size); | ||
std::size_t count = 0; | ||
for (index_type i = 0; i < vector_size; i++) { | ||
result[i] = 0.0; | ||
for (index_type j = 0; j < row_map_ptr[i + 1] - row_map_ptr[i]; j++) { | ||
result[i] += values_ptr[count] * vector_ptr[entries_ptr[count]]; | ||
count++; | ||
const index_type vector_size, | ||
const index_type *row_map_ptr, | ||
[[maybe_unused]] const index_type row_map_size, | ||
const index_type *entries_ptr, | ||
const std::complex<fp_precision> *values_ptr, | ||
[[maybe_unused]] const index_type numNNZ, | ||
index_type num_threads = 0) { | ||
// Output vector initialized to zero | ||
std::vector<std::complex<fp_precision>> result(vector_size, std::complex<fp_precision>(0.0)); | ||
|
||
// Determine the number of threads to use | ||
if (num_threads <= 0) { | ||
const int max_threads = std::thread::hardware_concurrency(); | ||
num_threads = std::min(vector_size, static_cast<index_type>(max_threads)); | ||
} | ||
|
||
// Divide the rows approximately evenly among the threads | ||
index_type chunk_size = (vector_size + num_threads - 1) / num_threads; | ||
std::vector<std::thread> threads; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would you rewrite this code using C++20 multi-threading features (e.g., jthreads)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could rewrite line 93 "std::vectorstd::thread threads;" as "std::vectorstd::jthread threads;" in order to use jthread. It would simplify the thread management since jthread doesnt require explicite use of join on the different threads after execution. We could also use std::views::iota(start, end) from the std::range library when calling emplace_back later in the code, it would helps avoid manually handling the iteration indices and make the code easier to read. Also, if later we would change the chunk logic, using ranges we won't have to rewrite the loop logic. |
||
|
||
// Create and launch threads | ||
for (index_type t = 0; t < num_threads; ++t) { | ||
index_type start = t * chunk_size; | ||
index_type end = std::min(start + chunk_size, vector_size); | ||
|
||
// Only launch threads if there are rows to process | ||
if (start < vector_size) { | ||
threads.emplace_back(sparse_worker<fp_precision, index_type>, | ||
vector_ptr, row_map_ptr, entries_ptr, values_ptr, | ||
std::ref(result), start, end); | ||
} | ||
} | ||
|
||
// Wait for all threads to complete | ||
for (auto &th : threads) { | ||
if (th.joinable()) { | ||
th.join(); | ||
} | ||
} | ||
|
||
return result; | ||
}; | ||
} // namespace Pennylane::LightningQubit::Util | ||
|
||
} // namespace Pennylane::LightningQubit::Util |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you explain how collapsing these nested for-loops can improve performance?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By collapsing nested these for-loops it would lower the the loop control overhead and lower the number of access to memory which could improve performance. Instead of looping for every value in the sparse matrix and adding it to the the temporary result we could here add them and then do the multiplication by the vector value. It would limit the number of call to the data and therefore improve performance.