From 2f475aef0c2abacd62076af1f246194708c3d80f Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Wed, 10 Aug 2022 15:17:01 +0200 Subject: [PATCH] New CUDA simulator. --- apps/qsim_base_cuda.cu | 24 +- apps/qsim_qtrajectory_cuda.cu | 16 +- docs/cirq_interface.md | 8 +- lib/run_qsim.h | 4 + lib/simulator_cuda.h | 7487 ++------------------ lib/simulator_cuda_kernels.h | 4747 ++----------- lib/vectorspace.h | 2 + lib/vectorspace_cuda.h | 4 + pybind_interface/cuda/pybind_main_cuda.cpp | 6 +- pybind_interface/pybind_main.cpp | 5 - qsimcirq/qsim_simulator.py | 8 +- tests/hybrid_cuda_test.cu | 21 +- tests/qtrajectory_cuda_test.cu | 46 +- tests/simulator_cuda_test.cu | 132 +- tests/simulator_testfixture.h | 134 +- tests/statespace_cuda_test.cu | 104 +- 16 files changed, 1292 insertions(+), 11456 deletions(-) diff --git a/apps/qsim_base_cuda.cu b/apps/qsim_base_cuda.cu index d90ca477..b4af7967 100644 --- a/apps/qsim_base_cuda.cu +++ b/apps/qsim_base_cuda.cu @@ -112,46 +112,42 @@ int main(int argc, char* argv[]) { return 1; } - Circuit> circuit; + using fp_type = float; + + Circuit> circuit; if (!CircuitQsimParser::FromFile(opt.maxtime, opt.circuit_file, circuit)) { return 1; } struct Factory { - using Simulator = qsim::SimulatorCUDA; + using Simulator = qsim::SimulatorCUDA; using StateSpace = Simulator::StateSpace; - Factory(const StateSpace::Parameter& param1, - const Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - const StateSpace::Parameter& param1; - const Simulator::Parameter& param2; + const StateSpace::Parameter& param; }; using Simulator = Factory::Simulator; using StateSpace = Simulator::StateSpace; using State = StateSpace::State; - using Fuser = MultiQubitGateFuser>; + using Fuser = MultiQubitGateFuser>; using Runner = QSimRunner; StateSpace::Parameter param1; param1.num_threads = opt.num_threads; param1.num_dblocks = opt.num_dblocks; - Simulator::Parameter param2; - param2.num_threads = opt.num_threads; - - Factory factory(param1, param2); + Factory factory(param1); StateSpace state_space = factory.CreateStateSpace(); State state = state_space.Create(circuit.num_qubits); diff --git a/apps/qsim_qtrajectory_cuda.cu b/apps/qsim_qtrajectory_cuda.cu index 65fe1cd3..0d513cfa 100644 --- a/apps/qsim_qtrajectory_cuda.cu +++ b/apps/qsim_qtrajectory_cuda.cu @@ -190,23 +190,20 @@ int main(int argc, char* argv[]) { using fp_type = float; struct Factory { - using Simulator = qsim::SimulatorCUDA; + using Simulator = qsim::SimulatorCUDA; using StateSpace = Simulator::StateSpace; - Factory(const StateSpace::Parameter& param1, - const Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - const StateSpace::Parameter& param1; - const Simulator::Parameter& param2; + const StateSpace::Parameter& param; }; using Simulator = Factory::Simulator; @@ -235,8 +232,7 @@ int main(int argc, char* argv[]) { } StateSpace::Parameter param1; - Simulator::Parameter param2; - Factory factory(param1, param2); + Factory factory(param1); Simulator simulator = factory.CreateSimulator(); StateSpace 
state_space = factory.CreateStateSpace(); diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md index 24149708..c06653b7 100644 --- a/docs/cirq_interface.md +++ b/docs/cirq_interface.md @@ -190,11 +190,9 @@ is required to enable GPU execution: library if set to any other value. If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can -optionally be set to fine-tune perfomance for a specific device or circuit. +optionally be set to fine-tune StateSpace perfomance for a specific device. In most cases, the default values provide good performance. -* `gpu_sim_threads`: number of threads per CUDA block to use for the GPU -Simulator. This must be a power of 2 in the range [32, 256]. * `gpu_state_threads`: number of threads per CUDA block to use for the GPU StateSpace. This must be a power of 2 in the range [32, 1024]. -* `gpu_data_blocks`: number of data blocks to use on GPU. Below 16 data blocks, -performance is noticeably reduced. +* `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace. +Below 16 data blocks, performance is noticeably reduced. diff --git a/lib/run_qsim.h b/lib/run_qsim.h index b0aad9f3..37529152 100644 --- a/lib/run_qsim.h +++ b/lib/run_qsim.h @@ -133,6 +133,7 @@ struct QSimRunner final { } if (param.verbosity > 3) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } @@ -147,6 +148,7 @@ struct QSimRunner final { } if (param.verbosity > 0) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("time is %g seconds.\n", t2 - t0); } @@ -221,12 +223,14 @@ struct QSimRunner final { } if (param.verbosity > 3) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } } if (param.verbosity > 0) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("simu time is %g seconds.\n", t2 - t0); } diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index b507a224..66bf702a 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -20,10 +20,11 @@ #include #include #include +#include +#include #include "bits.h" #include "statespace_cuda.h" -#include "util_cuda.h" namespace qsim { @@ -32,34 +33,28 @@ namespace qsim { */ template class SimulatorCUDA final { - public: - struct Parameter { - /** - * The number of threads per block. - * Should be 2 to the power of k, where k is in the range [5,8]. - * Note that the number of registers on the multiprocessor can be - * exceeded if k > 8 (num_threads > 256). - */ - unsigned num_threads = 256; - }; + private: + using idx_type = uint64_t; + using Complex = qsim::Complex; + + // The maximum buffer size for indices and gate matrices. + // The maximum gate matrix size (for 6-qubit gates) is + // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is + // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). 
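  // Illustrative arithmetic for the buffer size above: with FP = float, an
  // 8-byte idx_type and a 4-byte unsigned, this gives
  //   max_buf_size = 8192 * 4 + 128 * 8 + 96 * 4 = 34176 bytes (about 33 KiB);
  // with FP = double it is 8192 * 8 + 1024 + 384 = 66944 bytes (about 65 KiB).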
+ static constexpr unsigned max_buf_size = 8192 * sizeof(FP) + + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); + public: using StateSpace = StateSpaceCUDA; using State = typename StateSpace::State; using fp_type = typename StateSpace::fp_type; - explicit SimulatorCUDA(const Parameter& param) - : param_(param), scratch_(nullptr), scratch_size_(0) { - ErrorCheck(cudaMalloc(&d_wf, 131072 * sizeof(fp_type))); - ErrorCheck(cudaMalloc(&d_idx, 992 * sizeof(unsigned))); - ErrorCheck(cudaMalloc(&d_ms, 7 * sizeof(uint64_t))); - ErrorCheck(cudaMalloc(&d_xss, 64 * sizeof(uint64_t))); + SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { + ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); } ~SimulatorCUDA() { - ErrorCheck(cudaFree(d_wf)); - ErrorCheck(cudaFree(d_idx)); - ErrorCheck(cudaFree(d_ms)); - ErrorCheck(cudaFree(d_xss)); + ErrorCheck(cudaFree(d_ws)); if (scratch_ != nullptr) { ErrorCheck(cudaFree(scratch_)); @@ -76,80 +71,54 @@ class SimulatorCUDA final { const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - ApplyGate1H(qs, matrix, state); - } else { - ApplyGate1L(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 4) { - ApplyGate2HH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate2HL(qs, matrix, state); - } else { - ApplyGate2LL(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 4) { - ApplyGate3HHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate3HHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate3HLL(qs, matrix, state); - } else { - ApplyGate3LLL(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 4) { - ApplyGate4HHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate4HHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate4HHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate4HLLL(qs, matrix, state); - } else { - ApplyGate4LLLL(qs, matrix, state); + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; } - break; - case 5: - if (qs[0] > 4) { - ApplyGate5HHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate5HHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate5HHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate5HHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - ApplyGate5HLLLL(qs, matrix, state); - } else { - ApplyGate5LLLLL(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 4) { - ApplyGate6HHHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate6HHHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate6HHHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate6HHHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - ApplyGate6HHLLLL(qs, matrix, state); - } else { - ApplyGate6HLLLLL(qs, matrix, state); + } else { + switch (qs.size()) { + case 1: + ApplyGateL<1>(qs, matrix, state); + break; + case 2: + ApplyGateL<2>(qs, matrix, state); + break; + case 3: + ApplyGateL<3>(qs, matrix, state); + break; + case 4: + ApplyGateL<4>(qs, matrix, state); + break; + case 5: + ApplyGateL<5>(qs, matrix, state); + break; + case 6: + ApplyGateL<6>(qs, matrix, state); + break; + default: + // Not implemented. 
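          // Note: gates on more than six qubits are not implemented; max_buf_size
          // above is sized for six-qubit gate matrices. ApplyGateH<G> (taken when
          // the lowest gate qubit is above 4) indexes the state only through the
          // global ms/xss tables, while ApplyGateL<G> also uses the qis/tis tables
          // for the shared-memory handling of qubits 0-4 (see the Indices structs
          // and the index comment further below).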
+ break; } - break; - default: - // Not implemented. - break; } } @@ -157,118 +126,76 @@ class SimulatorCUDA final { * Applies a controlled gate using CUDA instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; } - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); - } - } - break; - case 2: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); - } - } - break; - case 3: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[2] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); - } + if (cqs[0] < 5) { + switch (qs.size()) { + case 1: + ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. 
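          // Note: controlled gates are dispatched over at most four target qubits
          // here (versus six for uncontrolled gates). This branch covers
          // cqs[0] < 5, i.e. a control qubit in the low range; the branch below
          // uses ApplyControlledGateHH when all gate qubits are above 4 and
          // ApplyControlledGateLH when some gate qubits are low but all control
          // qubits are high.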
+ break; } - break; - case 4: + } else { if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[2] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[3] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + switch (qs.size()) { + case 1: + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; } } else { - if (cqs[0] > 4) { - ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + switch (qs.size()) { + case 1: + ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; } } - break; - default: - // Not implemented. - break; } } @@ -284,80 +211,42 @@ class SimulatorCUDA final { const State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - return ExpectationValue1H(qs, matrix, state); - } else { - return ExpectationValue1L(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 4) { - return ExpectationValue2HH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue2HL(qs, matrix, state); - } else { - return ExpectationValue2LL(qs, matrix, state); + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + case 2: + return ExpectationValueH<2>(qs, matrix, state); + case 3: + return ExpectationValueH<3>(qs, matrix, state); + case 4: + return ExpectationValueH<4>(qs, matrix, state); + case 5: + return ExpectationValueH<5>(qs, matrix, state); + case 6: + return ExpectationValueH<6>(qs, matrix, state); + default: + // Not implemented. 
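          // Note: observable sizes above six qubits fall through to the
          // 'return 0' at the end of this method. The dispatch mirrors ApplyGate:
          // ExpectationValueH<G> when the lowest qubit of the observable is above
          // 4, ExpectationValueL<G> otherwise.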
+ break; } - break; - case 3: - if (qs[0] > 4) { - return ExpectationValue3HHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue3HHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue3HLL(qs, matrix, state); - } else { - return ExpectationValue3LLL(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 4) { - return ExpectationValue4HHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue4HHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue4HHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue4HLLL(qs, matrix, state); - } else { - return ExpectationValue4LLLL(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 4) { - return ExpectationValue5HHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue5HHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue5HHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue5HHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - return ExpectationValue5HLLLL(qs, matrix, state); - } else { - return ExpectationValue5LLLLL(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 4) { - return ExpectationValue6HHHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue6HHHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue6HHHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue6HHHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - return ExpectationValue6HHLLLL(qs, matrix, state); - } else { - return ExpectationValue6HLLLLL(qs, matrix, state); + } else { + switch (qs.size()) { + case 1: + return ExpectationValueL<1>(qs, matrix, state); + case 2: + return ExpectationValueL<2>(qs, matrix, state); + case 3: + return ExpectationValueL<3>(qs, matrix, state); + case 4: + return ExpectationValueL<4>(qs, matrix, state); + case 5: + return ExpectationValueL<5>(qs, matrix, state); + case 6: + return ExpectationValueL<6>(qs, matrix, state); + default: + // Not implemented. + break; } - break; - default: - // Not implemented. - break; } return 0; @@ -371,6750 +260,652 @@ class SimulatorCUDA final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + // The following indices are used in kernels. + // xss - indices to access the state vector entries in global memory. + // ms - masks to access the state vector entries in global memory. + // tis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // qis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // cis - additional indices to access the state vector entries in global + // memory in the presence of low control qubits. 
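  // Illustrative example (values as computed by GetIndicesH below): for a
  // two-qubit gate acting on qubits {5, 7} of a 10-qubit state,
  //   ms  = {0x1f, 0x40, 0x300}   (bits below, between and above the gate qubits)
  //   xss = {0, 64, 256, 320}     (sums of subsets of xs[0] = 1 << 6 and xs[1] = 1 << 8).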
+ + template + struct IndicesH { + static constexpr unsigned gsize = 1 << G; + static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); + static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); + static constexpr unsigned ms_size = 32 * sizeof(idx_type); + static constexpr unsigned xss_offs = matrix_size; + static constexpr unsigned ms_offs = xss_offs + xss_size; + static constexpr unsigned buf_size = ms_offs + ms_size; + + IndicesH(char* p) + : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} + + idx_type* xss; + idx_type* ms; + }; - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + template + struct IndicesL : public IndicesH { + using Base = IndicesH; + static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); + static constexpr unsigned tis_size = 32 * sizeof(unsigned); + static constexpr unsigned qis_offs = Base::buf_size; + static constexpr unsigned tis_offs = qis_offs + qis_size; + static constexpr unsigned buf_size = tis_offs + tis_size; + + IndicesL(char* p) + : Base(p), qis((unsigned*) (p + qis_offs)), + tis((unsigned*) (p + tis_offs)) {} + + unsigned* qis; + unsigned* tis; + }; - fp_type* rstate = state.get(); + template + struct IndicesLC : public IndicesL { + using Base = IndicesL; + static constexpr unsigned cis_size = 32 * sizeof(idx_type); + static constexpr unsigned cis_offs = Base::buf_size; + static constexpr unsigned buf_size = cis_offs + cis_size; - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + idx_type* cis; + }; - ApplyGate1H_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + struct DataC { + idx_type cvalsh; + unsigned num_aqs; + unsigned num_effective_qs; + unsigned remaining_low_cqs; + }; - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned qmask = (1 << qs[0]); + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } + unsigned k = 5 + G; + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); - unsigned l = 2 * (2 * i + m); + IndicesH d_i(d_ws); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + ApplyGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - fp_type* rstate = state.get(); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesL d_i(d_ws); - ApplyGate1L_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + 1 << num_effective_qs, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, idx_type cvals, + const fp_type* matrix, State& state) const { + unsigned aqs[64]; + idx_type cmaskh = 0; + unsigned num_qubits = state.num_qubits(); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesH h_i(h_ws); - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, h_i.ms); + GetXss(num_qubits, qs, qs.size(), h_i.xss); - fp_type* rstate = state.get(); + idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesH d_i(d_ws); - ApplyGate2HH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyControlledGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + void ApplyControlledGateLH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; + IndicesL h_i(h_ws); + auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); - unsigned qmask = (1 << qs[0]); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } + IndicesL d_i(d_ws); - unsigned l = 2 * (4 * i + m); + ApplyControlledGateLH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + IndicesLC h_i(h_ws); + auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesLC d_i(d_ws); - ApplyGate2HL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyControlledGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, + 1 << (5 - d.remaining_low_cqs), state.get()); } - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned l = 2 * (4 * i + m); + unsigned k = 5 + G; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); + unsigned threads = 64U; + unsigned blocks = std::max(1U, (size / 2) >> s); + unsigned num_iterations_per_block = 1 << s; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + constexpr unsigned m = 16; - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; - fp_type* rstate = state.get(); + IndicesH d_i(d_ws); - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + ExpectationValueH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + double mul = size == 1 ? 
0.5 : 1.0; - ApplyGate2LL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); } - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); + unsigned threads = 32; + unsigned blocks = size >> s; + unsigned num_iterations_per_block = 1 << s; - ApplyGate3HHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + constexpr unsigned m = 16; - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesL d_i(d_ws); - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; + ExpectationValueL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + num_iterations_per_block, state.get(), Plus(), d_res1); - unsigned qmask = (1 << qs[0]); + double mul = double(1 << (5 + num_effective_qs - G)) / 32; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } + template + std::complex ExpectationValueReduceFinal( + unsigned blocks, double mul, + const Complex* d_res1, Complex* d_res2) const { + Complex res2[m]; - unsigned l = 2 * (8 * i + m); + if (blocks <= 16) { + ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), + cudaMemcpyDeviceToHost)); + } else { + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned bytes = threads2 * sizeof(Complex); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + Reduce2Kernel<<>>( + dblocks, blocks, Plus(), Plus(), d_res1, d_res2); - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), + cudaMemcpyDeviceToHost)); - fp_type* rstate = state.get(); + blocks = blocks2; + } - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + double re = 0; + double im = 0; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + for (unsigned i = 0; i < blocks; ++i) { + re += res2[i].re; + im += res2[i].im; + } - ApplyGate3HHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return {mul * re, mul * im}; } - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + template + unsigned GetHighQubits(const std::vector& qs, unsigned qi, + const std::vector& cqs, unsigned ci, + unsigned ai, idx_type& cmaskh, AQ& aqs) const { + while (1) { + if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { + aqs[ai++] = qs[qi++]; + } else if (ci < cqs.size()) { + cmaskh |= idx_type{1} << cqs[ci]; + aqs[ai++] = cqs[ci++]; + } else { + break; } - xss[i] = a; } - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + return ai; + } - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); + template + void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* ms) const { + if (qs_size == 0) { + ms[0] = idx_type(-1); + } else { + idx_type xs = idx_type{1} << (qs[0] + 1); + ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < qs_size; ++i) { + ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); + xs = idx_type{1} << (qs[i] + 1); } + ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); } + } - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } + template + void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* xss) const { + if (qs_size == 0) { + xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; - unsigned l = 2 * (8 * i + m); + idx_type xs[64]; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + xs[0] = idx_type{1} << (qs[0] + 1); + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); } + xss[i] = a; } } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate3HLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); } - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; + template + void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, + IndicesH& indices) const { + if (qs_size == 0) { + indices.ms[0] = idx_type(-1); + indices.xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + idx_type xs[64]; - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); + xs[0] = idx_type{1} << (qs[0] + 1); + indices.ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); } - } + indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); } + indices.xss[i] = a; + } + } + } - unsigned l = 2 * (8 * i + m); + template + void GetIndicesL(unsigned num_effective_qs, unsigned qmask, + IndicesL& indices) const { + for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { + indices.ms[i] = 0; + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { + indices.xss[i] = 0; + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } + for (unsigned i = 0; i < indices.gsize; ++i) { + indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); } - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); + unsigned tmask = ((1 << (5 + num_effective_qs)) - 1) ^ qmask; + for (unsigned i = 0; i < 32; ++i) { + indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); + } + } - fp_type* rstate = state.get(); + template + unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, + IndicesL& indices) const { + unsigned eqs[32]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned qmaskh = 0; + unsigned qmaskl = 0; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned qi = 0; - ApplyGate3LLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + unsigned nq = std::max(5U, num_qubits); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; + unsigned l = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + if (qs.size() == num_low_qs) { + while (ei < num_effective_qs && l++ < num_low_qs) { + eqs[ei] = ei + 5; + ++ei; + } + } else { + while (ei < num_effective_qs && l < num_low_qs) { + unsigned ei5 = ei + 5; + eqs[ei] = ei5; + if (qi < qs.size() && qs[qi] == ei5) { + ++qi; + qmaskh |= 1 << ei5; + } else { + ++l; } + ++ei; } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + while (ei < num_effective_qs) { + eqs[ei] = qs[qi++]; + qmaskh |= 1 << (ei + 5); + ++ei; + } + } - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + GetIndicesH(num_qubits, eqs, num_effective_qs, indices); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - ApplyGate4HHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return num_effective_qs; } - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesL& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; - unsigned qmask = (1 << qs[0]); + unsigned qi = 0; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; } - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned l = 0; + unsigned ai = 5; + unsigned ci = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } + while (ai < num_qubits && l < num_low_qs) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + eqs[ei++] = ai; } + ++ai; } - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + unsigned i = ai; + unsigned j = qi; - fp_type* rstate = state.get(); + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - ApplyGate4HHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return {cvalsh, num_aqs, num_effective_qs}; } - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesLC& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + idx_type cmaskl = 0; + idx_type cis_mask = 0; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + unsigned qi = 0; + unsigned ci = 0; - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); + for (unsigned k = 0; k < 5; ++k) { + if (qi < qs.size() && qs[qi] == k) { + qmaskl |= 1 << (k - ci); + ++qi; + } else if (ci < cqs.size() && cqs[ci] == k) { + cmaskl |= idx_type{1} << k; + ++ci; } } - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } + unsigned num_low_qs = qi; + unsigned num_low_cqs = ci; - unsigned l = 2 * (16 * i + m); + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned l = 0; + unsigned ai = 5; + unsigned ei = 0; + unsigned num_low = num_low_qs + num_low_cqs; + unsigned remaining_low_cqs = num_low_cqs; + unsigned effective_low_qs = num_low_qs; + unsigned highest_cis_bit = 0; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; + while (ai < num_qubits && l < num_low) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + if ((ai - ci) > 4) { + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + qmaskl |= 1 << (ai - ci); + --remaining_low_cqs; + ++effective_low_qs; + } + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + if (remaining_low_cqs == 0) { + eqs[ei++] = ai; + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + --remaining_low_cqs; } } + ++ai; } - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * 
sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + unsigned i = ai; + unsigned j = effective_low_qs; - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - ApplyGate4HHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + cis_mask |= 31 ^ cmaskl; + highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; + for (idx_type i = 0; i < 32; ++i) { + auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); + indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; } - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; + return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; + } - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); } - } - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate4HLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate4LLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate4LLLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
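// Illustrative sketch (not qsim API): the index bookkeeping that each of the
// removed ApplyGate*H* methods above repeats inline. Given the sorted "high"
// target qubits (q > 4), ms[] splits a state index at the target positions and
// xss[] enumerates the offsets of all 2^h target-bit patterns; the +1 in the
// stride shift appears to account for the separately stored real/imaginary
// parts. Helper and struct names here are assumptions, not part of qsim.
#include <cstdint>
#include <vector>

struct HighQubitMasks {
  std::vector<uint64_t> ms;   // h + 1 index-splitting masks
  std::vector<uint64_t> xss;  // 2^h row-block offsets
};

inline HighQubitMasks BuildHighQubitMasks(const std::vector<unsigned>& qs,
                                          unsigned num_qubits) {
  unsigned h = qs.size();  // assumes h >= 1 and qs sorted ascending
  std::vector<uint64_t> xs(h);
  HighQubitMasks r{std::vector<uint64_t>(h + 1),
                   std::vector<uint64_t>(uint64_t{1} << h)};

  xs[0] = uint64_t{1} << (qs[0] + 1);
  r.ms[0] = (uint64_t{1} << qs[0]) - 1;
  for (unsigned i = 1; i < h; ++i) {
    xs[i] = uint64_t{1} << (qs[i] + 1);
    r.ms[i] = ((uint64_t{1} << qs[i]) - 1) ^ (xs[i - 1] - 1);
  }
  r.ms[h] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[h - 1] - 1);

  // xss[i] sums the strides of every qubit whose bit is set in i.
  for (uint64_t i = 0; i < (uint64_t{1} << h); ++i) {
    uint64_t a = 0;
    for (unsigned k = 0; k < h; ++k) {
      if ((i >> k) & 1) a += xs[k];
    }
    r.xss[i] = a;
  }
  return r;
}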
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5LLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[992]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 32 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5LLLLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 7 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 64 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 11; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[131072]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 131072 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
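// Illustrative sketch (not qsim API): the launch-configuration arithmetic
// shared by all of the removed Apply* methods. num_high counts target plus
// control qubits above position 4, and max_threads stands in for the old
// param_.num_threads. The 32 warp lanes appear to cover the five lowest
// qubits, each high qubit adds one bit to k, and the remaining n bits are
// spread over 32 * 2^n threads; e.g. the six-high-qubit case above uses
// k = 11 = 5 + 6.
#include <algorithm>
#include <cstdint>

struct LaunchConfig {
  unsigned threads;
  unsigned blocks;
};

inline LaunchConfig MakeLaunchConfig(unsigned num_qubits, unsigned num_high,
                                     unsigned max_threads) {
  unsigned k = 5 + num_high;
  unsigned n = num_qubits > k ? num_qubits - k : 0;
  uint64_t size = uint64_t{1} << n;

  unsigned threads = std::min(32 * size, uint64_t{max_threads});
  unsigned blocks = 32 * size / threads;
  return {threads, blocks};
}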
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[65536]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 65536 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HLLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[5] + 1); - ms[0] = (uint64_t{1} << qs[5]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[992]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2048 * i + 64 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HLLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1H_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1H_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1L_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1L_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
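// Illustrative sketch (not qsim API): the control-qubit bookkeeping used by
// the removed ApplyControlledGate*_L methods above. Controls at positions
// 0..4 ("low") are checked per warp lane via cmaskl/emaskl, while high
// controls and high targets go into cmaskh/emaskh, presumably used by the
// kernel to rebuild global state indices. Assumes cqs is sorted ascending so
// the low bits of cmask belong to the low controls; ExpandBitsLocal mirrors
// what bits::ExpandBits is assumed to do.
#include <cstdint>
#include <vector>

// Scatter the low bits of `bits` into the set positions of `mask`
// (a software pdep over an n-bit word).
inline uint64_t ExpandBitsLocal(uint64_t bits, unsigned n, uint64_t mask) {
  uint64_t r = 0;
  unsigned k = 0;
  for (unsigned i = 0; i < n; ++i) {
    if ((mask >> i) & 1) {
      r |= ((bits >> k) & 1) << i;
      ++k;
    }
  }
  return r;
}

struct ControlMasks {
  unsigned cl;      // number of low controls (q <= 4)
  uint64_t cmaskh;  // required values of the high control bits
  uint64_t cmaskl;  // required values of the low control bits
  uint64_t emaskh;  // bits that are neither control, high target, nor q < 5
  uint64_t emaskl;  // positions of the low controls
};

inline ControlMasks SplitControls(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, unsigned num_qubits) {
  ControlMasks r{0, 0, 0, 0, 0};
  for (auto q : cqs) {
    if (q > 4) {
      r.emaskh |= uint64_t{1} << q;
    } else {
      ++r.cl;
      r.emaskl |= uint64_t{1} << q;
    }
  }
  r.cmaskh = ExpandBitsLocal(cmask >> r.cl, num_qubits, r.emaskh);
  r.cmaskl = ExpandBitsLocal(cmask & ((uint64_t{1} << r.cl) - 1), 5, r.emaskl);
  for (auto q : qs) {
    if (q > 4) r.emaskh |= uint64_t{1} << q;  // high targets are fixed too
  }
  r.emaskh = ~r.emaskh ^ 31;
  return r;
}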
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2LL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2LL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
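// Illustrative sketch (not qsim API): why the *_L variants above stage a
// per-lane matrix. Warp lane j holds an amplitude whose low qubits may or may
// not satisfy the low-control condition; lanes that do not match are handed
// the identity matrix (1 on the diagonal of the real part, 0 elsewhere)
// instead of the gate matrix, so their amplitudes pass through unchanged.
// D is the gate dimension (2, 4, 8, ...); fp_type is taken as float here.
#include <cstdint>

// Select the complex matrix element (row, col) that a given lane should use.
inline void LaneMatrixElement(const float* matrix, unsigned D,
                              unsigned row, unsigned col,
                              uint64_t cmaskl, uint64_t emaskl, unsigned lane,
                              float& re, float& im) {
  bool active = cmaskl == (lane & emaskl);  // low control bits match?
  float diag = (row == col) ? 1.0f : 0.0f;  // identity fallback
  re = active ? matrix[2 * (D * row + col)] : diag;
  im = active ? matrix[2 * (D * row + col) + 1] : 0.0f;
}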
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3LLL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3LLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3LLL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
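/* A minimal host-side sketch of the launch-configuration arithmetic that
   every ApplyControlledGate* and ExpectationValue* method in this removed
   code repeats: one warp (32 threads) works on a 32-amplitude slice of the
   state vector, `k` counts the qubits consumed per slice plus any control
   qubits, and the block count follows from that. LaunchConfig and
   MakeLaunchConfig are hypothetical helper names used only for this
   illustration; num_threads stands in for the old per-simulator thread
   parameter. */
#include <algorithm>
#include <cstdint>

struct LaunchConfig {
  unsigned threads;
  unsigned blocks;
};

inline LaunchConfig MakeLaunchConfig(unsigned num_qubits, unsigned k,
                                     unsigned num_threads) {
  unsigned n = num_qubits > k ? num_qubits - k : 0;
  uint64_t size = uint64_t{1} << n;  // number of 32-amplitude groups
  unsigned threads = unsigned(std::min(32 * size, uint64_t{num_threads}));
  unsigned blocks = unsigned(32 * size / threads);
  return {threads, blocks};
}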
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
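/* Sketch of the high/low control-qubit split performed by the *_L variants
   above: control qubits above position 4 are folded into the global masks
   tested by the kernel (emaskh/cmaskh), while controls on qubits 0..4 live
   inside a 32-amplitude lane and are instead expanded into a 5-bit lane mask
   (emaskl/cmaskl) that the host uses when it pre-scatters the gate matrix,
   so lanes failing the low-control condition effectively apply the identity.
   HighLowMasks and SplitControls are hypothetical names for illustration;
   bits::ExpandBits is assumed to scatter the given value bits into the set
   positions of the mask, as it is used throughout this file. */
#include <cstdint>
#include <vector>
#include "bits.h"  // bits::ExpandBits

struct HighLowMasks {
  unsigned cl;      // number of low (lane-level) control qubits
  uint64_t emaskl;  // low-control positions within a 32-amplitude lane
  uint64_t cmaskl;  // required low-control values, expanded over emaskl
  uint64_t emaskh;  // high-control positions over the global state index
  uint64_t cmaskh;  // required high-control values, expanded over emaskh
};

inline HighLowMasks SplitControls(const std::vector<unsigned>& cqs,
                                  uint64_t cmask, unsigned num_qubits) {
  HighLowMasks m{0, 0, 0, 0, 0};
  for (auto q : cqs) {
    if (q > 4) {
      m.emaskh |= uint64_t{1} << q;  // tested by the kernel's index math
    } else {
      ++m.cl;
      m.emaskl |= uint64_t{1} << q;  // folded into the matrix on the host
    }
  }
  // cqs is assumed sorted, so the low `cl` bits of cmask belong to the
  // low controls and the remaining bits to the high controls.
  m.cmaskh = bits::ExpandBits(cmask >> m.cl, num_qubits, m.emaskh);
  m.cmaskl = bits::ExpandBits(cmask & ((uint64_t{1} << m.cl) - 1), 5, m.emaskl);
  return m;
}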
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HLLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HLLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4LLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4LLLL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4LLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4LLLL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue1H_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue1L(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j 
< 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue1L_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2HH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2HH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2HL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
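/* Host-side shape of the two-stage sum shared by the ExpectationValue*
   methods above: the per-gate kernel writes one partial Complex sum per
   thread block into scratch memory, and when more than one block was
   launched, Reduce2Kernel folds those partials into a single value that is
   then copied back to the host. ReducePartialSums is a hypothetical wrapper
   written only for illustration; Complex, Plus, Reduce2Kernel and ErrorCheck
   are the names used in the removed code, and the double template argument
   to Plus is an assumption. */
template <typename Complex>
Complex ReducePartialSums(Complex* resd1, Complex* resd2, unsigned blocks,
                          unsigned num_threads) {
  Complex result;

  if (blocks == 1) {
    // A single block already holds the full sum.
    ErrorCheck(
        cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost));
  } else {
    auto op = Plus<double>();

    unsigned threads2 = std::min(num_threads, std::max(32U, blocks));
    unsigned dblocks2 = std::max(1U, blocks / threads2);
    unsigned bytes2 = threads2 * sizeof(Complex);  // dynamic shared memory

    Reduce2Kernel<<<1, threads2, bytes2>>>(
        dblocks2, blocks, op, op, resd1, resd2);
    ErrorCheck(cudaPeekAtLastError());
    ErrorCheck(cudaDeviceSynchronize());

    ErrorCheck(
        cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost));
  }

  return result;
}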
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2HL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2LL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2LL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3LLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3LLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4LLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4LLLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5LLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[992]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 32 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5LLLLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 7 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 64 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 11; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[131072]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 131072 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[65536]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 65536 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HLLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[5] + 1); - ms[0] = (uint64_t{1} << qs[5]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[992]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2048 * i + 64 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HLLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 5, mask); - return bits::ExpandBits((c + b) % lsize, 5, mask); - } - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } + const_cast(scratch_size_) = size; + } return scratch_; } - Parameter param_; - - fp_type* d_wf; - unsigned* d_idx; - uint64_t* d_ms; - uint64_t* d_xss; + char* d_ws; + char h_ws0[max_buf_size]; + char* h_ws = (char*) h_ws0; void* scratch_; uint64_t scratch_size_; diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index c2e66273..6510fadf 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -18,4519 +18,660 @@ #include #include -#include -#include -#include - #include "util_cuda.h" namespace qsim { -template -__device__ __forceinline__ Integer ExpandBits( - Integer bits, unsigned n, Integer mask) { - Integer ebits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - ebits |= ((bits >> k) & 1) << i; - ++k; - } - } - - return ebits; -} +template +__global__ void ApplyGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. -template -__global__ void ApplyGate1H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + fp_type rs[gsize], is[gsize]; - auto p0 = rstate + 2 * k + lane; + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate1L_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + __syncthreads(); - *(p0) = rn; - *(p0 + 32) = in; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate2HH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - j += 2; + __syncthreads(); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate2HL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - 
unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + unsigned j = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; } } +} - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; +template +__global__ void ApplyGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned esize, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. -template -__global__ void ApplyGate2LL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - auto p0 = rstate + 64 * i + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); + __shared__ fp_type v[2 * gsize * rows]; + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyGate3HHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); } - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate3HHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + auto p0 = rstate + 2 * ii + threadIdx.x; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - unsigned j = lane; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } -}; - -template -__global__ void 
ApplyGate3HLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - } - unsigned j = lane; + unsigned j = 0; - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - j += 64; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - j += 64; + rs0[m][n] = rn; + is0[m][n] = in; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate3LLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } +} - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; +template +__global__ void ApplyControlledGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. -template -__global__ void ApplyGate4HHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 
gsize : 32) : (G < 5 ? 8 : 16)); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + fp_type rs[gsize], is[gsize]; - auto p0 = rstate + 2 * k + lane; + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate4HHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } } - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + __syncthreads(); - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate4HHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + ii |= cvalsh; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); - j += 64; + for (unsigned m = 0; m < 
2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + __syncthreads(); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate4HLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + unsigned j = 0; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; } } +} - unsigned j = lane; +template +__global__ void ApplyControlledGateLH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, + unsigned esize, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - j += 64; + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + fp_type rs[gsize], is[gsize]; - j += 64; - } + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate4LLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + ii |= cvalsh; - auto p0 = rstate + 64 * i + lane; + auto p0 = rstate + 2 * ii + threadIdx.x; - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0) = rn; - *(p0 + 32) = in; } -}; -template -__global__ void ApplyGate5HHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - unsigned j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate5HHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + unsigned j = 0; - uint64_t k = (32 * i & ms[0]) | (64 
* i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + rs0[m][n] = rn; + is0[m][n] = in; } } - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } -}; +} -template -__global__ void ApplyGate5HHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; +template +__global__ void ApplyControlledGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, + unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - auto p0 = rstate + 2 * k + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; + ii |= cvalsh; - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + auto p0 = rstate + 2 * ii + cis[threadIdx.x]; - j += 64; + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate5HHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate5HLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * 
w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate5LLLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; + unsigned j = 0; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 64 * i + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 1; ++l) { - rs[32 * l] = *(p0); - is[32 * l] = *(p0 + 32); + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); + rs0[m][n] = rn; + is0[m][n] = in; } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyGate6HHHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]) - | (2048 * i & ms[6]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); } +} - unsigned j = 0; +template +__global__ void ExpectationValueH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 64. - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - j += 2; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 
4 : 8) : 8); - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + fp_type rs[gsize], is[gsize]; - j += 2; - } + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } -}; - -template -__global__ void ApplyGate6HHHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate6HHHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + __syncthreads(); - auto p0 = rstate + 2 * k + lane; + double re = 0; + double im = 0; - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); + for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - j += 64; + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + __syncthreads(); -template 
-__global__ void ApplyGate6HHHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + __syncthreads(); + } - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); + unsigned j = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } } } - unsigned j = lane; + __shared__ cfp_type partial1[64]; + __shared__ cfp_type partial2[2]; - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + partial1[threadIdx.x].re = re; + partial1[threadIdx.x].im = im; - j += 64; + auto val = WarpReduce(partial1[threadIdx.x], op); - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + if (threadIdx.x % 32 == 0) { + partial2[threadIdx.x / 32] = val; + } - j += 64; - } + __syncthreads(); - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + if (threadIdx.x == 0) { + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; } -}; +} -template -__global__ void ApplyGate6HHLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; +template +__global__ void ExpectationValueL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 32. - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); - auto p0 = rstate + 2 * k + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate6HLLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; + double re = 0; + double im = 0; - for (unsigned l = 0; l < 2; ++l) { - rs[32 * l] = *(p0 + xss[l]); - is[32 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); + for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - j += 64; + auto p0 = rstate + 2 * ii + threadIdx.x; - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1H_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 
1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1H_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + unsigned j = 0; - j += 64; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1L_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + __shared__ cfp_type partial[32]; - j += 64; + partial[threadIdx.x].re = re; + partial[threadIdx.x].im = im; - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + auto val = WarpReduce(partial[threadIdx.x], op); - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1L_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * 
l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } + if (threadIdx.x == 0) { + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - 
*(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2LL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2LL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* 
__restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 
32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3LLL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 
* (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3LLL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHL_H_Kernel( - const fp_type* 
__restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void 
ApplyControlledGate4HHLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HLLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HLLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; 
- } -}; - -template -__global__ void ApplyControlledGate4LLLL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4LLLL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ExpectationValue1H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - 
if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue1L_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2HH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2HL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = 
(uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2LL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = 
*(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - 
} - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3LLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * 
in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * 
rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4LLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im 
= im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = 
WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } 
- - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 16 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5LLLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[32 * l] = *(p0); - is[32 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - 
if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]) - | (2048 * i & ms[6]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - 
result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if 
(threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 16 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HLLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[32 * l] = *(p0 + xss[l]); - is[32 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 32 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r 
= 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; +} } // namespace qsim diff --git a/lib/vectorspace.h b/lib/vectorspace.h index 5a5a6c94..246394e1 100644 --- a/lib/vectorspace.h +++ b/lib/vectorspace.h @@ -160,6 +160,8 @@ class VectorSpace { return true; } + void DeviceSync() {} + protected: For for_; }; diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index ac228c63..d26f003f 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -141,6 +141,10 @@ class VectorSpaceCUDA { return true; } + void DeviceSync() { + cudaDeviceSynchronize(); + } + protected: }; diff --git a/pybind_interface/cuda/pybind_main_cuda.cpp b/pybind_interface/cuda/pybind_main_cuda.cpp index 88fa3a61..57b0ba84 100644 --- a/pybind_interface/cuda/pybind_main_cuda.cpp +++ b/pybind_interface/cuda/pybind_main_cuda.cpp @@ -27,19 +27,17 @@ namespace qsim { unsigned num_sim_threads, unsigned num_state_threads, unsigned num_dblocks - ) : ss_params{num_state_threads, num_dblocks}, - sim_params{num_sim_threads} {} + ) : ss_params{num_state_threads, num_dblocks} {} StateSpace CreateStateSpace() const { return StateSpace(ss_params); } Simulator CreateSimulator() const { - return Simulator(sim_params); + return Simulator(); } StateSpace::Parameter ss_params; - Simulator::Parameter sim_params; }; inline void SetFlushToZeroAndDenormalsAreZeros() {} diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp index 5ddee99f..74fa3a31 100644 --- a/pybind_interface/pybind_main.cpp +++ b/pybind_interface/pybind_main.cpp @@ -399,7 +399,6 @@ std::vector> qsim_simulate(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -464,7 +463,6 @@ std::vector> qtrajectory_simulate(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -659,7 +657,6 @@ class SimulatorHelper { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -985,7 +982,6 @@ std::vector qsim_sample(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -1054,7 +1050,6 @@ std::vector qtrajectory_sample(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py index b59a6f36..37d497ed 100644 --- a/qsimcirq/qsim_simulator.py +++ b/qsimcirq/qsim_simulator.py @@ -63,12 +63,10 @@ class QSimOptions: gpu_mode: use CUDA if set to 0 (default value) or use the NVIDIA cuStateVec library if set to any other value. 
The "gpu_*" arguments below are only considered if this is set to 0. - gpu_sim_threads: number of threads per CUDA block to use for the GPU - Simulator. This must be a power of 2 in the range [32, 256]. gpu_state_threads: number of threads per CUDA block to use for the GPU StateSpace. This must be a power of 2 in the range [32, 1024]. - gpu_data_blocks: number of data blocks to use on GPU. Below 16 data - blocks, performance is noticeably reduced. + gpu_data_blocks: number of data blocks to use for the GPU StateSpace. + Below 16 data blocks, performance is noticeably reduced. verbosity: Logging verbosity. denormals_are_zeros: if true, set flush-to-zero and denormals-are-zeros MXCSR control flags. This prevents rare cases of performance @@ -80,7 +78,6 @@ class QSimOptions: ev_noisy_repetitions: int = 1 use_gpu: bool = False gpu_mode: int = 0 - gpu_sim_threads: int = 256 gpu_state_threads: int = 512 gpu_data_blocks: int = 16 verbosity: int = 0 @@ -97,7 +94,6 @@ def as_dict(self): "r": self.ev_noisy_repetitions, "g": self.use_gpu, "gmode": self.gpu_mode, - "gsmt": self.gpu_sim_threads, "gsst": self.gpu_state_threads, "gdb": self.gpu_data_blocks, "v": self.verbosity, diff --git a/tests/hybrid_cuda_test.cu b/tests/hybrid_cuda_test.cu index 28da96dc..6f2640a3 100644 --- a/tests/hybrid_cuda_test.cu +++ b/tests/hybrid_cuda_test.cu @@ -26,35 +26,30 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TEST(HybridCUDATest, Hybrid2) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestHybrid2(factory); } TEST(HybridCUDATest, Hybrid4) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestHybrid4(factory); } diff --git a/tests/qtrajectory_cuda_test.cu b/tests/qtrajectory_cuda_test.cu index 730ff7ed..459a25f6 100644 --- a/tests/qtrajectory_cuda_test.cu +++ b/tests/qtrajectory_cuda_test.cu @@ -26,75 +26,65 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TEST(QTrajectoryCUDATest, BitFlip) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + 
Factory::StateSpace::Parameter param; + Factory factory(param); TestBitFlip(factory); } TEST(QTrajectoryCUDATest, GenDump) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestGenDump(factory); } TEST(QTrajectoryCUDATest, ReusingResults) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestReusingResults(factory); } TEST(QTrajectoryCUDATest, CollectKopStat) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestCollectKopStat(factory); } TEST(QTrajectoryCUDATest, CleanCircuit) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestCleanCircuit(factory); } TEST(QTrajectoryCUDATest, InitialState) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestInitialState(factory); } TEST(QTrajectoryCUDATest, UncomputeFinalState) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestUncomputeFinalState(factory); } diff --git a/tests/simulator_cuda_test.cu b/tests/simulator_cuda_test.cu index efc202aa..baecfd45 100644 --- a/tests/simulator_cuda_test.cu +++ b/tests/simulator_cuda_test.cu @@ -34,152 +34,88 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TYPED_TEST(SimulatorCUDATest, ApplyGate1) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate1(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate1(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate2) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate2(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate2(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate3) { using Factory = qsim::Factory; - - for 
(unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate3(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate3(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate5) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate5(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate5(factory); } TYPED_TEST(SimulatorCUDATest, CircuitWithControlledGates) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestCircuitWithControlledGates(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestCircuitWithControlledGates(factory); } TYPED_TEST(SimulatorCUDATest, CircuitWithControlledGatesDagger) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestCircuitWithControlledGatesDagger(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestCircuitWithControlledGatesDagger(factory); } TYPED_TEST(SimulatorCUDATest, MultiQubitGates) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestMultiQubitGates(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestMultiQubitGates(factory); } TYPED_TEST(SimulatorCUDATest, ControlledGates) { using Factory = qsim::Factory; - + typename Factory::StateSpace::Parameter param; + Factory factory(param); bool high_precision = std::is_same::value; - - for (unsigned num_threads : {64, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestControlledGates(factory, high_precision); - } + TestControlledGates(factory, high_precision); } TYPED_TEST(SimulatorCUDATest, ExpectationValue1) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestExpectationValue1(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestExpectationValue1(factory); } TYPED_TEST(SimulatorCUDATest, ExpectationValue2) { using Factory = qsim::Factory; - - for (unsigned num_threads : {256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestExpectationValue2(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestExpectationValue2(factory); } } // namespace qsim diff --git 
a/tests/simulator_testfixture.h b/tests/simulator_testfixture.h index ef335565..31cdcc7e 100644 --- a/tests/simulator_testfixture.h +++ b/tests/simulator_testfixture.h @@ -1147,60 +1147,63 @@ void TestMultiQubitGates(const Factory& factory) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_minq = 4; - unsigned max_gate_qubits = 6; - unsigned num_qubits = max_gate_qubits + max_minq; + unsigned max_num_qubits = 10 + std::log2(Simulator::SIMDRegisterSize()); StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(num_qubits); - std::vector matrix; - matrix.reserve(1 << (2 * max_gate_qubits + 1)); + matrix.reserve(1 << (2 * 6 + 1)); std::vector qubits; - qubits.reserve(max_gate_qubits); - - std::vector vec(state_space.MinSize(num_qubits)); + qubits.reserve(6); - unsigned size = 1 << num_qubits; - fp_type inorm = std::sqrt(1.0 / (1 << num_qubits)); + std::vector vec(state_space.MinSize(max_num_qubits)); - for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned size1 = 1 << q; - unsigned size2 = size1 * size1; + for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + auto state = state_space.Create(num_qubits); - matrix.resize(0); + unsigned size = 1 << num_qubits; + fp_type inorm = std::sqrt(1.0 / (1 << num_qubits)); + unsigned max_gate_qubits = std::min(6U, num_qubits); - for (unsigned i = 0; i < 2 * size2; ++i) { - matrix.push_back(i + 1); - } + for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned mask = (1 << q) - 1; + unsigned size1 = 1 << q; + unsigned size2 = size1 * size1; - for (unsigned k = 0; k <= max_minq; ++k) { - qubits.resize(0); + matrix.resize(0); - for (unsigned i = 0; i < q; ++i) { - qubits.push_back(i + k); + for (unsigned i = 0; i < 2 * size2; ++i) { + matrix.push_back(i + 1); } - state_space.SetStateUniform(state); - simulator.ApplyGate(qubits, matrix.data(), state); + unsigned mask = (1 << q) - 1; + unsigned max_minq = num_qubits - q; - state_space.InternalToNormalOrder(state); - state_space.Copy(state, vec.data()); + for (unsigned k = 0; k <= max_minq; ++k) { + qubits.resize(0); + + for (unsigned i = 0; i < q; ++i) { + qubits.push_back(i + k); + } - for (unsigned i = 0; i < size; ++i) { - unsigned j = (i >> k) & mask; + state_space.SetStateUniform(state); + simulator.ApplyGate(qubits, matrix.data(), state); - // Expected results are calculated analytically. - fp_type expected_real = size2 * (1 + 2 * j) * inorm; - fp_type expected_imag = expected_real + size1 * inorm; + state_space.InternalToNormalOrder(state); + state_space.Copy(state, vec.data()); - EXPECT_NEAR(vec[2 * i], expected_real, 1e-6); - EXPECT_NEAR(vec[2 * i + 1], expected_imag, 1e-6); + for (unsigned i = 0; i < size; ++i) { + unsigned j = (i >> k) & mask; + + // Expected results are calculated analytically. 
+ fp_type expected_real = size2 * (1 + 2 * j) * inorm; + fp_type expected_imag = expected_real + size1 * inorm; + + EXPECT_NEAR(vec[2 * i] / expected_real, 1.0, 1e-6); + EXPECT_NEAR(vec[2 * i + 1] / expected_imag, 1.0, 1e-6); + } } } } @@ -1212,15 +1215,13 @@ void TestControlledGates(const Factory& factory, bool high_precision) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_qubits = 5 + std::log2(Simulator::SIMDRegisterSize()); + unsigned max_qubits = 6 + std::log2(Simulator::SIMDRegisterSize()); unsigned max_target_qubits = 4; unsigned max_control_qubits = 3; StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(max_qubits); - std::vector qubits; qubits.reserve(max_qubits); @@ -1237,6 +1238,8 @@ void TestControlledGates(const Factory& factory, bool high_precision) { unsigned size = 1 << num_qubits; unsigned nmask = size - 1; + auto state = state_space.Create(num_qubits); + // Iterate over control qubits (as a binary mask). for (unsigned cmask = 0; cmask <= nmask; ++cmask) { cqubits.resize(0); @@ -1359,47 +1362,54 @@ void TestExpectationValue1(const Factory& factory) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_minq = 4; - unsigned max_gate_qubits = 6; - unsigned num_qubits = max_gate_qubits + max_minq; + unsigned rsize = std::log2(Simulator::SIMDRegisterSize()); + unsigned max_num_qubits = 10 + rsize; StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(num_qubits); - std::vector matrix; - matrix.reserve(1 << (2 * max_gate_qubits + 1)); + matrix.reserve(1 << (2 * 6 + 1)); std::vector qubits; - qubits.reserve(max_gate_qubits); + qubits.reserve(6); - for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned size1 = 1 << q; - unsigned size2 = size1 * size1; + std::vector vec(state_space.MinSize(max_num_qubits)); - // Expected results are calculated analytically. - fp_type expected_real = size2 * size1; - fp_type expected_imag = expected_real + size1; + for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + auto state = state_space.Create(num_qubits); - matrix.resize(0); + unsigned max_gate_qubits = std::min(6U, num_qubits); - for (unsigned i = 0; i < 2 * size2; ++i) { - matrix.push_back(i + 1); - } + for (unsigned q = 1; q <= max_gate_qubits; ++q) { + unsigned size1 = 1 << q; + unsigned size2 = size1 * size1; + + // Expected results are calculated analytically. 
+ fp_type expected_real = size2 * size1; + fp_type expected_imag = expected_real + size1; - for (unsigned k = 0; k <= max_minq; ++k) { - qubits.resize(0); + matrix.resize(0); - for (unsigned i = 0; i < q; ++i) { - qubits.push_back(i + k); + for (unsigned i = 0; i < 2 * size2; ++i) { + matrix.push_back(i + 1); } - state_space.SetStateUniform(state); - auto eval = simulator.ExpectationValue(qubits, matrix.data(), state); + unsigned max_minq = std::min(num_qubits - q, rsize + 3); - EXPECT_NEAR(std::real(eval), expected_real, 1e-6); - EXPECT_NEAR(std::imag(eval), expected_imag, 1e-6); + for (unsigned k = 0; k <= max_minq; ++k) { + qubits.resize(0); + + for (unsigned i = 0; i < q; ++i) { + qubits.push_back(i + k); + } + + state_space.SetStateUniform(state); + auto eval = simulator.ExpectationValue(qubits, matrix.data(), state); + + EXPECT_NEAR(std::real(eval) / expected_real, 1.0, 1e-6); + EXPECT_NEAR(std::imag(eval) / expected_imag, 1.0, 1e-6); + } } } } diff --git a/tests/statespace_cuda_test.cu b/tests/statespace_cuda_test.cu index 489b5bf0..51b3ffb8 100644 --- a/tests/statespace_cuda_test.cu +++ b/tests/statespace_cuda_test.cu @@ -33,32 +33,28 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TYPED_TEST(StateSpaceCUDATest, Add) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestAdd(factory); } } @@ -66,14 +62,13 @@ TYPED_TEST(StateSpaceCUDATest, Add) { TYPED_TEST(StateSpaceCUDATest, NormSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormSmall(factory); } } @@ -81,14 +76,13 @@ TYPED_TEST(StateSpaceCUDATest, NormSmall) { TYPED_TEST(StateSpaceCUDATest, NormAndInnerProductSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormAndInnerProductSmall(factory); } } @@ -96,14 +90,13 @@ TYPED_TEST(StateSpaceCUDATest, NormAndInnerProductSmall) { TYPED_TEST(StateSpaceCUDATest, NormAndInnerProduct) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + 
param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormAndInnerProduct(factory); } } @@ -111,14 +104,13 @@ TYPED_TEST(StateSpaceCUDATest, NormAndInnerProduct) { TYPED_TEST(StateSpaceCUDATest, SamplingSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestSamplingSmall(factory); } } @@ -126,14 +118,13 @@ TYPED_TEST(StateSpaceCUDATest, SamplingSmall) { TYPED_TEST(StateSpaceCUDATest, SamplingCrossEntropyDifference) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestSamplingCrossEntropyDifference(factory); } } @@ -141,14 +132,13 @@ TYPED_TEST(StateSpaceCUDATest, SamplingCrossEntropyDifference) { TYPED_TEST(StateSpaceCUDATest, Ordering) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestOrdering(factory); } } @@ -156,22 +146,20 @@ TYPED_TEST(StateSpaceCUDATest, Ordering) { TEST(StateSpaceCUDATest, MeasurementSmall) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestMeasurementSmall(factory, true); } TYPED_TEST(StateSpaceCUDATest, MeasurementLarge) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestMeasurementLarge(factory); } } @@ -179,14 +167,13 @@ TYPED_TEST(StateSpaceCUDATest, MeasurementLarge) { TYPED_TEST(StateSpaceCUDATest, Collapse) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestCollapse(factory); } } @@ -194,22 +181,21 @@ TYPED_TEST(StateSpaceCUDATest, Collapse) { TEST(StateSpaceCUDATest, InvalidStateSize) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + 
Factory::StateSpace::Parameter param;
+  Factory factory(param);
 
   TestInvalidStateSize(factory);
 }
 
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmpl) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitude(factory);
     }
   }
@@ -217,14 +203,13 @@ TYPED_TEST(StateSpaceCUDATest, BulkSetAmpl) {
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmplExclusion) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitudeExclusion(factory);
     }
   }
@@ -232,14 +217,13 @@ TYPED_TEST(StateSpaceCUDATest, BulkSetAmplExclusion) {
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmplDefault) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitudeDefault(factory);
     }
   }
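
Usage note (not part of the patch): the tests above all follow the same construction pattern, where StateSpace keeps its Parameter (num_threads, num_dblocks) and the simulator is now built without one. The minimal sketch below illustrates that pattern outside the test factories; the qubit count, gate values, and parameter settings are arbitrary, and it assumes a CUDA-enabled qsim build with simulator_cuda.h on the include path.

// Illustrative sketch only: construct the CUDA state space and the
// parameterless simulator, then apply a single gate.
#include <vector>

#include "simulator_cuda.h"

int main() {
  using Simulator = qsim::SimulatorCUDA<float>;
  using StateSpace = Simulator::StateSpace;

  StateSpace::Parameter param;
  param.num_threads = 256;  // threads per block for StateSpace kernels
  param.num_dblocks = 16;   // data blocks; fewer than 16 reduces performance

  StateSpace state_space(param);
  Simulator simulator;      // no Simulator::Parameter anymore

  auto state = state_space.Create(2);  // two-qubit state
  state_space.SetStateUniform(state);  // uniform superposition

  // Hadamard on qubit 0; matrix entries are interleaved (re, im) pairs.
  const float h = 0.7071067811865475f;
  const float matrix[] = {h, 0, h, 0, h, 0, -h, 0};
  simulator.ApplyGate({0}, matrix, state);

  return 0;
}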