From 2f475aef0c2abacd62076af1f246194708c3d80f Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Wed, 10 Aug 2022 15:17:01 +0200 Subject: [PATCH] New CUDA simulator. --- apps/qsim_base_cuda.cu | 24 +- apps/qsim_qtrajectory_cuda.cu | 16 +- docs/cirq_interface.md | 8 +- lib/run_qsim.h | 4 + lib/simulator_cuda.h | 7487 ++------------------ lib/simulator_cuda_kernels.h | 4747 ++----------- lib/vectorspace.h | 2 + lib/vectorspace_cuda.h | 4 + pybind_interface/cuda/pybind_main_cuda.cpp | 6 +- pybind_interface/pybind_main.cpp | 5 - qsimcirq/qsim_simulator.py | 8 +- tests/hybrid_cuda_test.cu | 21 +- tests/qtrajectory_cuda_test.cu | 46 +- tests/simulator_cuda_test.cu | 132 +- tests/simulator_testfixture.h | 134 +- tests/statespace_cuda_test.cu | 104 +- 16 files changed, 1292 insertions(+), 11456 deletions(-) diff --git a/apps/qsim_base_cuda.cu b/apps/qsim_base_cuda.cu index d90ca477..b4af7967 100644 --- a/apps/qsim_base_cuda.cu +++ b/apps/qsim_base_cuda.cu @@ -112,46 +112,42 @@ int main(int argc, char* argv[]) { return 1; } - Circuit> circuit; + using fp_type = float; + + Circuit> circuit; if (!CircuitQsimParser::FromFile(opt.maxtime, opt.circuit_file, circuit)) { return 1; } struct Factory { - using Simulator = qsim::SimulatorCUDA; + using Simulator = qsim::SimulatorCUDA; using StateSpace = Simulator::StateSpace; - Factory(const StateSpace::Parameter& param1, - const Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - const StateSpace::Parameter& param1; - const Simulator::Parameter& param2; + const StateSpace::Parameter& param; }; using Simulator = Factory::Simulator; using StateSpace = Simulator::StateSpace; using State = StateSpace::State; - using Fuser = MultiQubitGateFuser>; + using Fuser = MultiQubitGateFuser>; using Runner = QSimRunner; StateSpace::Parameter param1; param1.num_threads = opt.num_threads; param1.num_dblocks = opt.num_dblocks; - Simulator::Parameter param2; - param2.num_threads = opt.num_threads; - - Factory factory(param1, param2); + Factory factory(param1); StateSpace state_space = factory.CreateStateSpace(); State state = state_space.Create(circuit.num_qubits); diff --git a/apps/qsim_qtrajectory_cuda.cu b/apps/qsim_qtrajectory_cuda.cu index 65fe1cd3..0d513cfa 100644 --- a/apps/qsim_qtrajectory_cuda.cu +++ b/apps/qsim_qtrajectory_cuda.cu @@ -190,23 +190,20 @@ int main(int argc, char* argv[]) { using fp_type = float; struct Factory { - using Simulator = qsim::SimulatorCUDA; + using Simulator = qsim::SimulatorCUDA; using StateSpace = Simulator::StateSpace; - Factory(const StateSpace::Parameter& param1, - const Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - const StateSpace::Parameter& param1; - const Simulator::Parameter& param2; + const StateSpace::Parameter& param; }; using Simulator = Factory::Simulator; @@ -235,8 +232,7 @@ int main(int argc, char* argv[]) { } StateSpace::Parameter param1; - Simulator::Parameter param2; - Factory factory(param1, param2); + Factory factory(param1); Simulator simulator = factory.CreateSimulator(); StateSpace 
state_space = factory.CreateStateSpace(); diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md index 24149708..c06653b7 100644 --- a/docs/cirq_interface.md +++ b/docs/cirq_interface.md @@ -190,11 +190,9 @@ is required to enable GPU execution: library if set to any other value. If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can -optionally be set to fine-tune perfomance for a specific device or circuit. +optionally be set to fine-tune StateSpace perfomance for a specific device. In most cases, the default values provide good performance. -* `gpu_sim_threads`: number of threads per CUDA block to use for the GPU -Simulator. This must be a power of 2 in the range [32, 256]. * `gpu_state_threads`: number of threads per CUDA block to use for the GPU StateSpace. This must be a power of 2 in the range [32, 1024]. -* `gpu_data_blocks`: number of data blocks to use on GPU. Below 16 data blocks, -performance is noticeably reduced. +* `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace. +Below 16 data blocks, performance is noticeably reduced. diff --git a/lib/run_qsim.h b/lib/run_qsim.h index b0aad9f3..37529152 100644 --- a/lib/run_qsim.h +++ b/lib/run_qsim.h @@ -133,6 +133,7 @@ struct QSimRunner final { } if (param.verbosity > 3) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } @@ -147,6 +148,7 @@ struct QSimRunner final { } if (param.verbosity > 0) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("time is %g seconds.\n", t2 - t0); } @@ -221,12 +223,14 @@ struct QSimRunner final { } if (param.verbosity > 3) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } } if (param.verbosity > 0) { + state_space.DeviceSync(); double t2 = GetTime(); IO::messagef("simu time is %g seconds.\n", t2 - t0); } diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index b507a224..66bf702a 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -20,10 +20,11 @@ #include #include #include +#include +#include #include "bits.h" #include "statespace_cuda.h" -#include "util_cuda.h" namespace qsim { @@ -32,34 +33,28 @@ namespace qsim { */ template class SimulatorCUDA final { - public: - struct Parameter { - /** - * The number of threads per block. - * Should be 2 to the power of k, where k is in the range [5,8]. - * Note that the number of registers on the multiprocessor can be - * exceeded if k > 8 (num_threads > 256). - */ - unsigned num_threads = 256; - }; + private: + using idx_type = uint64_t; + using Complex = qsim::Complex; + + // The maximum buffer size for indices and gate matrices. + // The maximum gate matrix size (for 6-qubit gates) is + // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is + // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). 
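  // Illustrative arithmetic for the buffer size above: with FP = float, an
  // 8-byte idx_type and a 4-byte unsigned, this gives
  //   max_buf_size = 8192 * 4 + 128 * 8 + 96 * 4 = 34176 bytes (about 33 KiB);
  // with FP = double it is 8192 * 8 + 1024 + 384 = 66944 bytes (about 65 KiB).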
+ static constexpr unsigned max_buf_size = 8192 * sizeof(FP) + + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); + public: using StateSpace = StateSpaceCUDA; using State = typename StateSpace::State; using fp_type = typename StateSpace::fp_type; - explicit SimulatorCUDA(const Parameter& param) - : param_(param), scratch_(nullptr), scratch_size_(0) { - ErrorCheck(cudaMalloc(&d_wf, 131072 * sizeof(fp_type))); - ErrorCheck(cudaMalloc(&d_idx, 992 * sizeof(unsigned))); - ErrorCheck(cudaMalloc(&d_ms, 7 * sizeof(uint64_t))); - ErrorCheck(cudaMalloc(&d_xss, 64 * sizeof(uint64_t))); + SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { + ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); } ~SimulatorCUDA() { - ErrorCheck(cudaFree(d_wf)); - ErrorCheck(cudaFree(d_idx)); - ErrorCheck(cudaFree(d_ms)); - ErrorCheck(cudaFree(d_xss)); + ErrorCheck(cudaFree(d_ws)); if (scratch_ != nullptr) { ErrorCheck(cudaFree(scratch_)); @@ -76,80 +71,54 @@ class SimulatorCUDA final { const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - ApplyGate1H(qs, matrix, state); - } else { - ApplyGate1L(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 4) { - ApplyGate2HH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate2HL(qs, matrix, state); - } else { - ApplyGate2LL(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 4) { - ApplyGate3HHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate3HHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate3HLL(qs, matrix, state); - } else { - ApplyGate3LLL(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 4) { - ApplyGate4HHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate4HHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate4HHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate4HLLL(qs, matrix, state); - } else { - ApplyGate4LLLL(qs, matrix, state); + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; } - break; - case 5: - if (qs[0] > 4) { - ApplyGate5HHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate5HHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate5HHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate5HHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - ApplyGate5HLLLL(qs, matrix, state); - } else { - ApplyGate5LLLLL(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 4) { - ApplyGate6HHHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - ApplyGate6HHHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - ApplyGate6HHHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - ApplyGate6HHHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - ApplyGate6HHLLLL(qs, matrix, state); - } else { - ApplyGate6HLLLLL(qs, matrix, state); + } else { + switch (qs.size()) { + case 1: + ApplyGateL<1>(qs, matrix, state); + break; + case 2: + ApplyGateL<2>(qs, matrix, state); + break; + case 3: + ApplyGateL<3>(qs, matrix, state); + break; + case 4: + ApplyGateL<4>(qs, matrix, state); + break; + case 5: + ApplyGateL<5>(qs, matrix, state); + break; + case 6: + ApplyGateL<6>(qs, matrix, state); + break; + default: + // Not implemented. 
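          // Note: gates on more than six qubits are not implemented; max_buf_size
          // above is sized for six-qubit gate matrices. ApplyGateH<G> (taken when
          // the lowest gate qubit is above 4) indexes the state only through the
          // global ms/xss tables, while ApplyGateL<G> also uses the qis/tis tables
          // for the shared-memory handling of qubits 0-4 (see the Indices structs
          // and the index comment further below).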
+ break; } - break; - default: - // Not implemented. - break; } } @@ -157,118 +126,76 @@ class SimulatorCUDA final { * Applies a controlled gate using CUDA instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; } - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); - } - } - break; - case 2: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); - } - } - break; - case 3: - if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[2] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); - } - } else { - if (cqs[0] > 4) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); - } + if (cqs[0] < 5) { + switch (qs.size()) { + case 1: + ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. 
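          // Note: controlled gates are dispatched over at most four target qubits
          // here (versus six for uncontrolled gates). This branch covers
          // cqs[0] < 5, i.e. a control qubit in the low range; the branch below
          // uses ApplyControlledGateHH when all gate qubits are above 4 and
          // ApplyControlledGateLH when some gate qubits are low but all control
          // qubits are high.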
+ break; } - break; - case 4: + } else { if (qs[0] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[1] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[2] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); - } - } else if (qs[3] > 4) { - if (cqs[0] > 4) { - ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + switch (qs.size()) { + case 1: + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; } } else { - if (cqs[0] > 4) { - ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); - } else { - ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + switch (qs.size()) { + case 1: + ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; } } - break; - default: - // Not implemented. - break; } } @@ -284,80 +211,42 @@ class SimulatorCUDA final { const State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . - switch (qs.size()) { - case 1: - if (qs[0] > 4) { - return ExpectationValue1H(qs, matrix, state); - } else { - return ExpectationValue1L(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 4) { - return ExpectationValue2HH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue2HL(qs, matrix, state); - } else { - return ExpectationValue2LL(qs, matrix, state); + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + case 2: + return ExpectationValueH<2>(qs, matrix, state); + case 3: + return ExpectationValueH<3>(qs, matrix, state); + case 4: + return ExpectationValueH<4>(qs, matrix, state); + case 5: + return ExpectationValueH<5>(qs, matrix, state); + case 6: + return ExpectationValueH<6>(qs, matrix, state); + default: + // Not implemented. 
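          // Note: observable sizes above six qubits fall through to the
          // 'return 0' at the end of this method. The dispatch mirrors ApplyGate:
          // ExpectationValueH<G> when the lowest qubit of the observable is above
          // 4, ExpectationValueL<G> otherwise.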
+ break; } - break; - case 3: - if (qs[0] > 4) { - return ExpectationValue3HHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue3HHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue3HLL(qs, matrix, state); - } else { - return ExpectationValue3LLL(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 4) { - return ExpectationValue4HHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue4HHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue4HHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue4HLLL(qs, matrix, state); - } else { - return ExpectationValue4LLLL(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 4) { - return ExpectationValue5HHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue5HHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue5HHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue5HHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - return ExpectationValue5HLLLL(qs, matrix, state); - } else { - return ExpectationValue5LLLLL(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 4) { - return ExpectationValue6HHHHHH(qs, matrix, state); - } else if (qs[1] > 4) { - return ExpectationValue6HHHHHL(qs, matrix, state); - } else if (qs[2] > 4) { - return ExpectationValue6HHHHLL(qs, matrix, state); - } else if (qs[3] > 4) { - return ExpectationValue6HHHLLL(qs, matrix, state); - } else if (qs[4] > 4) { - return ExpectationValue6HHLLLL(qs, matrix, state); - } else { - return ExpectationValue6HLLLLL(qs, matrix, state); + } else { + switch (qs.size()) { + case 1: + return ExpectationValueL<1>(qs, matrix, state); + case 2: + return ExpectationValueL<2>(qs, matrix, state); + case 3: + return ExpectationValueL<3>(qs, matrix, state); + case 4: + return ExpectationValueL<4>(qs, matrix, state); + case 5: + return ExpectationValueL<5>(qs, matrix, state); + case 6: + return ExpectationValueL<6>(qs, matrix, state); + default: + // Not implemented. + break; } - break; - default: - // Not implemented. - break; } return 0; @@ -371,6750 +260,652 @@ class SimulatorCUDA final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + // The following indices are used in kernels. + // xss - indices to access the state vector entries in global memory. + // ms - masks to access the state vector entries in global memory. + // tis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // qis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // cis - additional indices to access the state vector entries in global + // memory in the presence of low control qubits. 
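  // Illustrative example (values as computed by GetIndicesH below): for a
  // two-qubit gate acting on qubits {5, 7} of a 10-qubit state,
  //   ms  = {0x1f, 0x40, 0x300}   (bits below, between and above the gate qubits)
  //   xss = {0, 64, 256, 320}     (sums of subsets of xs[0] = 1 << 6 and xs[1] = 1 << 8).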
+ + template + struct IndicesH { + static constexpr unsigned gsize = 1 << G; + static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); + static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); + static constexpr unsigned ms_size = 32 * sizeof(idx_type); + static constexpr unsigned xss_offs = matrix_size; + static constexpr unsigned ms_offs = xss_offs + xss_size; + static constexpr unsigned buf_size = ms_offs + ms_size; + + IndicesH(char* p) + : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} + + idx_type* xss; + idx_type* ms; + }; - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + template + struct IndicesL : public IndicesH { + using Base = IndicesH; + static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); + static constexpr unsigned tis_size = 32 * sizeof(unsigned); + static constexpr unsigned qis_offs = Base::buf_size; + static constexpr unsigned tis_offs = qis_offs + qis_size; + static constexpr unsigned buf_size = tis_offs + tis_size; + + IndicesL(char* p) + : Base(p), qis((unsigned*) (p + qis_offs)), + tis((unsigned*) (p + tis_offs)) {} + + unsigned* qis; + unsigned* tis; + }; - fp_type* rstate = state.get(); + template + struct IndicesLC : public IndicesL { + using Base = IndicesL; + static constexpr unsigned cis_size = 32 * sizeof(idx_type); + static constexpr unsigned cis_offs = Base::buf_size; + static constexpr unsigned buf_size = cis_offs + cis_size; - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + idx_type* cis; + }; - ApplyGate1H_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + struct DataC { + idx_type cvalsh; + unsigned num_aqs; + unsigned num_effective_qs; + unsigned remaining_low_cqs; + }; - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned qmask = (1 << qs[0]); + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } + unsigned k = 5 + G; + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); - unsigned l = 2 * (2 * i + m); + IndicesH d_i(d_ws); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + ApplyGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - fp_type* rstate = state.get(); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesL d_i(d_ws); - ApplyGate1L_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + 1 << num_effective_qs, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, idx_type cvals, + const fp_type* matrix, State& state) const { + unsigned aqs[64]; + idx_type cmaskh = 0; + unsigned num_qubits = state.num_qubits(); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesH h_i(h_ws); - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, h_i.ms); + GetXss(num_qubits, qs, qs.size(), h_i.xss); - fp_type* rstate = state.get(); + idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesH d_i(d_ws); - ApplyGate2HH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyControlledGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + void ApplyControlledGateLH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; + IndicesL h_i(h_ws); + auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); - unsigned qmask = (1 << qs[0]); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } + IndicesL d_i(d_ws); - unsigned l = 2 * (4 * i + m); + ApplyControlledGateLH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + IndicesLC h_i(h_ws); + auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + IndicesLC d_i(d_ws); - ApplyGate2HL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + ApplyControlledGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, + 1 << (5 - d.remaining_low_cqs), state.get()); } - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned l = 2 * (4 * i + m); + unsigned k = 5 + G; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); + unsigned threads = 64U; + unsigned blocks = std::max(1U, (size / 2) >> s); + unsigned num_iterations_per_block = 1 << s; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + constexpr unsigned m = 16; - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; - fp_type* rstate = state.get(); + IndicesH d_i(d_ws); - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + ExpectationValueH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + double mul = size == 1 ? 
0.5 : 1.0; - ApplyGate2LL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); } - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); + unsigned threads = 32; + unsigned blocks = size >> s; + unsigned num_iterations_per_block = 1 << s; - ApplyGate3HHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + constexpr unsigned m = 16; - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + IndicesL d_i(d_ws); - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; + ExpectationValueL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + num_iterations_per_block, state.get(), Plus(), d_res1); - unsigned qmask = (1 << qs[0]); + double mul = double(1 << (5 + num_effective_qs - G)) / 32; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } + template + std::complex ExpectationValueReduceFinal( + unsigned blocks, double mul, + const Complex* d_res1, Complex* d_res2) const { + Complex res2[m]; - unsigned l = 2 * (8 * i + m); + if (blocks <= 16) { + ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), + cudaMemcpyDeviceToHost)); + } else { + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned bytes = threads2 * sizeof(Complex); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } + Reduce2Kernel<<>>( + dblocks, blocks, Plus(), Plus(), d_res1, d_res2); - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), + cudaMemcpyDeviceToHost)); - fp_type* rstate = state.get(); + blocks = blocks2; + } - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + double re = 0; + double im = 0; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + for (unsigned i = 0; i < blocks; ++i) { + re += res2[i].re; + im += res2[i].im; + } - ApplyGate3HHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return {mul * re, mul * im}; } - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + template + unsigned GetHighQubits(const std::vector& qs, unsigned qi, + const std::vector& cqs, unsigned ci, + unsigned ai, idx_type& cmaskh, AQ& aqs) const { + while (1) { + if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { + aqs[ai++] = qs[qi++]; + } else if (ci < cqs.size()) { + cmaskh |= idx_type{1} << cqs[ci]; + aqs[ai++] = cqs[ci++]; + } else { + break; } - xss[i] = a; } - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + return ai; + } - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); + template + void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* ms) const { + if (qs_size == 0) { + ms[0] = idx_type(-1); + } else { + idx_type xs = idx_type{1} << (qs[0] + 1); + ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < qs_size; ++i) { + ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); + xs = idx_type{1} << (qs[i] + 1); } + ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); } + } - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } + template + void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* xss) const { + if (qs_size == 0) { + xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; - unsigned l = 2 * (8 * i + m); + idx_type xs[64]; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + xs[0] = idx_type{1} << (qs[0] + 1); + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); } + xss[i] = a; } } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate3HLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); } - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; + template + void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, + IndicesH& indices) const { + if (qs_size == 0) { + indices.ms[0] = idx_type(-1); + indices.xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + idx_type xs[64]; - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); + xs[0] = idx_type{1} << (qs[0] + 1); + indices.ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); } - } + indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); } + indices.xss[i] = a; + } + } + } - unsigned l = 2 * (8 * i + m); + template + void GetIndicesL(unsigned num_effective_qs, unsigned qmask, + IndicesL& indices) const { + for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { + indices.ms[i] = 0; + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { + indices.xss[i] = 0; + } - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } + for (unsigned i = 0; i < indices.gsize; ++i) { + indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); } - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); + unsigned tmask = ((1 << (5 + num_effective_qs)) - 1) ^ qmask; + for (unsigned i = 0; i < 32; ++i) { + indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); + } + } - fp_type* rstate = state.get(); + template + unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, + IndicesL& indices) const { + unsigned eqs[32]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned qmaskh = 0; + unsigned qmaskl = 0; - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned qi = 0; - ApplyGate3LLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + unsigned nq = std::max(5U, num_qubits); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; + unsigned l = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + if (qs.size() == num_low_qs) { + while (ei < num_effective_qs && l++ < num_low_qs) { + eqs[ei] = ei + 5; + ++ei; + } + } else { + while (ei < num_effective_qs && l < num_low_qs) { + unsigned ei5 = ei + 5; + eqs[ei] = ei5; + if (qi < qs.size() && qs[qi] == ei5) { + ++qi; + qmaskh |= 1 << ei5; + } else { + ++l; } + ++ei; } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + while (ei < num_effective_qs) { + eqs[ei] = qs[qi++]; + qmaskh |= 1 << (ei + 5); + ++ei; + } + } - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + GetIndicesH(num_qubits, eqs, num_effective_qs, indices); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - ApplyGate4HHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return num_effective_qs; } - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesL& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; - unsigned qmask = (1 << qs[0]); + unsigned qi = 0; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; } - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned l = 0; + unsigned ai = 5; + unsigned ci = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } + while (ai < num_qubits && l < num_low_qs) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + eqs[ei++] = ai; } + ++ai; } - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); + unsigned i = ai; + unsigned j = qi; - fp_type* rstate = state.get(); + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - ApplyGate4HHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); + return {cvalsh, num_aqs, num_effective_qs}; } - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesLC& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + idx_type cmaskl = 0; + idx_type cis_mask = 0; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + unsigned qi = 0; + unsigned ci = 0; - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); + for (unsigned k = 0; k < 5; ++k) { + if (qi < qs.size() && qs[qi] == k) { + qmaskl |= 1 << (k - ci); + ++qi; + } else if (ci < cqs.size() && cqs[ci] == k) { + cmaskl |= idx_type{1} << k; + ++ci; } } - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } + unsigned num_low_qs = qi; + unsigned num_low_cqs = ci; - unsigned l = 2 * (16 * i + m); + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } + unsigned l = 0; + unsigned ai = 5; + unsigned ei = 0; + unsigned num_low = num_low_qs + num_low_cqs; + unsigned remaining_low_cqs = num_low_cqs; + unsigned effective_low_qs = num_low_qs; + unsigned highest_cis_bit = 0; - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; + while (ai < num_qubits && l < num_low) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + if ((ai - ci) > 4) { + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + qmaskl |= 1 << (ai - ci); + --remaining_low_cqs; + ++effective_low_qs; + } + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + if (remaining_low_cqs == 0) { + eqs[ei++] = ai; + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + --remaining_low_cqs; } } + ++ai; } - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * 
sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); + unsigned i = ai; + unsigned j = effective_low_qs; - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - ApplyGate4HHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + cis_mask |= 31 ^ cmaskl; + highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; + for (idx_type i = 0; i < 32; ++i) { + auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); + indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; } - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; + return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; + } - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); } - } - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate4HLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate4LLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate4LLLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
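// Illustrative sketch (not qsim API): the index bookkeeping that each of the
// removed ApplyGate*H* methods above repeats inline. Given the sorted "high"
// target qubits (q > 4), ms[] splits a state index at the target positions and
// xss[] enumerates the offsets of all 2^h target-bit patterns; the +1 in the
// stride shift appears to account for the separately stored real/imaginary
// parts. Helper and struct names here are assumptions, not part of qsim.
#include <cstdint>
#include <vector>

struct HighQubitMasks {
  std::vector<uint64_t> ms;   // h + 1 index-splitting masks
  std::vector<uint64_t> xss;  // 2^h row-block offsets
};

inline HighQubitMasks BuildHighQubitMasks(const std::vector<unsigned>& qs,
                                          unsigned num_qubits) {
  unsigned h = qs.size();  // assumes h >= 1 and qs sorted ascending
  std::vector<uint64_t> xs(h);
  HighQubitMasks r{std::vector<uint64_t>(h + 1),
                   std::vector<uint64_t>(uint64_t{1} << h)};

  xs[0] = uint64_t{1} << (qs[0] + 1);
  r.ms[0] = (uint64_t{1} << qs[0]) - 1;
  for (unsigned i = 1; i < h; ++i) {
    xs[i] = uint64_t{1} << (qs[i] + 1);
    r.ms[i] = ((uint64_t{1} << qs[i]) - 1) ^ (xs[i - 1] - 1);
  }
  r.ms[h] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[h - 1] - 1);

  // xss[i] sums the strides of every qubit whose bit is set in i.
  for (uint64_t i = 0; i < (uint64_t{1} << h); ++i) {
    uint64_t a = 0;
    for (unsigned k = 0; k < h; ++k) {
      if ((i >> k) & 1) a += xs[k];
    }
    r.xss[i] = a;
  }
  return r;
}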
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5HLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5HLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate5LLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[32]; - unsigned idx[992]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 32 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate5LLLLL_Kernel<<>>( - d_wf, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 7 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 64 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 11; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[131072]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 131072 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
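// Illustrative sketch (not qsim API): the launch-configuration arithmetic
// shared by all of the removed Apply* methods. num_high counts target plus
// control qubits above position 4, and max_threads stands in for the old
// param_.num_threads. The 32 warp lanes appear to cover the five lowest
// qubits, each high qubit adds one bit to k, and the remaining n bits are
// spread over 32 * 2^n threads; e.g. the six-high-qubit case above uses
// k = 11 = 5 + 6.
#include <algorithm>
#include <cstdint>

struct LaunchConfig {
  unsigned threads;
  unsigned blocks;
};

inline LaunchConfig MakeLaunchConfig(unsigned num_qubits, unsigned num_high,
                                     unsigned max_threads) {
  unsigned k = 5 + num_high;
  unsigned n = num_qubits > k ? num_qubits - k : 0;
  uint64_t size = uint64_t{1} << n;

  unsigned threads = std::min(32 * size, uint64_t{max_threads});
  unsigned blocks = 32 * size / threads;
  return {threads, blocks};
}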
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[65536]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 65536 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HHLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HHLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyGate6HLLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[5] + 1); - ms[0] = (uint64_t{1} << qs[5]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[992]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2048 * i + 64 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyGate6HLLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1H_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1H_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1L_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate1L_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
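// Illustrative sketch (not qsim API): the control-qubit bookkeeping used by
// the removed ApplyControlledGate*_L methods above. Controls at positions
// 0..4 ("low") are checked per warp lane via cmaskl/emaskl, while high
// controls and high targets go into cmaskh/emaskh, presumably used by the
// kernel to rebuild global state indices. Assumes cqs is sorted ascending so
// the low bits of cmask belong to the low controls; ExpandBitsLocal mirrors
// what bits::ExpandBits is assumed to do.
#include <cstdint>
#include <vector>

// Scatter the low bits of `bits` into the set positions of `mask`
// (a software pdep over an n-bit word).
inline uint64_t ExpandBitsLocal(uint64_t bits, unsigned n, uint64_t mask) {
  uint64_t r = 0;
  unsigned k = 0;
  for (unsigned i = 0; i < n; ++i) {
    if ((mask >> i) & 1) {
      r |= ((bits >> k) & 1) << i;
      ++k;
    }
  }
  return r;
}

struct ControlMasks {
  unsigned cl;      // number of low controls (q <= 4)
  uint64_t cmaskh;  // required values of the high control bits
  uint64_t cmaskl;  // required values of the low control bits
  uint64_t emaskh;  // bits that are neither control, high target, nor q < 5
  uint64_t emaskl;  // positions of the low controls
};

inline ControlMasks SplitControls(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, unsigned num_qubits) {
  ControlMasks r{0, 0, 0, 0, 0};
  for (auto q : cqs) {
    if (q > 4) {
      r.emaskh |= uint64_t{1} << q;
    } else {
      ++r.cl;
      r.emaskl |= uint64_t{1} << q;
    }
  }
  r.cmaskh = ExpandBitsLocal(cmask >> r.cl, num_qubits, r.emaskh);
  r.cmaskl = ExpandBitsLocal(cmask & ((uint64_t{1} << r.cl) - 1), 5, r.emaskl);
  for (auto q : qs) {
    if (q > 4) r.emaskh |= uint64_t{1} << q;  // high targets are fixed too
  }
  r.emaskh = ~r.emaskh ^ 31;
  return r;
}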
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2HL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2LL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate2LL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
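// Illustrative sketch (not qsim API): why the *_L variants above stage a
// per-lane matrix. Warp lane j holds an amplitude whose low qubits may or may
// not satisfy the low-control condition; lanes that do not match are handed
// the identity matrix (1 on the diagonal of the real part, 0 elsewhere)
// instead of the gate matrix, so their amplitudes pass through unchanged.
// D is the gate dimension (2, 4, 8, ...); fp_type is taken as float here.
#include <cstdint>

// Select the complex matrix element (row, col) that a given lane should use.
inline void LaneMatrixElement(const float* matrix, unsigned D,
                              unsigned row, unsigned col,
                              uint64_t cmaskl, uint64_t emaskl, unsigned lane,
                              float& re, float& im) {
  bool active = cmaskl == (lane & emaskl);  // low control bits match?
  float diag = (row == col) ? 1.0f : 0.0f;  // identity fallback
  re = active ? matrix[2 * (D * row + col)] : diag;
  im = active ? matrix[2 * (D * row + col) + 1] : 0.0f;
}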
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HHL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3HLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3LLL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate3LLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate3LLL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHH_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 9 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
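/* A minimal host-side sketch of the launch-configuration arithmetic that
   every ApplyControlledGate* and ExpectationValue* method in this removed
   code repeats: one warp (32 threads) works on a 32-amplitude slice of the
   state vector, `k` counts the qubits consumed per slice plus any control
   qubits, and the block count follows from that. LaunchConfig and
   MakeLaunchConfig are hypothetical helper names used only for this
   illustration; num_threads stands in for the old per-simulator thread
   parameter. */
#include <algorithm>
#include <cstdint>

struct LaunchConfig {
  unsigned threads;
  unsigned blocks;
};

inline LaunchConfig MakeLaunchConfig(unsigned num_qubits, unsigned k,
                                     unsigned num_threads) {
  unsigned n = num_qubits > k ? num_qubits - k : 0;
  uint64_t size = uint64_t{1} << n;  // number of 32-amplitude groups
  unsigned threads = unsigned(std::min(32 * size, uint64_t{num_threads}));
  unsigned blocks = unsigned(32 * size / threads);
  return {threads, blocks};
}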
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHH_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHHL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
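/* Sketch of the high/low control-qubit split performed by the *_L variants
   above: control qubits above position 4 are folded into the global masks
   tested by the kernel (emaskh/cmaskh), while controls on qubits 0..4 live
   inside a 32-amplitude lane and are instead expanded into a 5-bit lane mask
   (emaskl/cmaskl) that the host uses when it pre-scatters the gate matrix,
   so lanes failing the low-control condition effectively apply the identity.
   HighLowMasks and SplitControls are hypothetical names for illustration;
   bits::ExpandBits is assumed to scatter the given value bits into the set
   positions of the mask, as it is used throughout this file. */
#include <cstdint>
#include <vector>
#include "bits.h"  // bits::ExpandBits

struct HighLowMasks {
  unsigned cl;      // number of low (lane-level) control qubits
  uint64_t emaskl;  // low-control positions within a 32-amplitude lane
  uint64_t cmaskl;  // required low-control values, expanded over emaskl
  uint64_t emaskh;  // high-control positions over the global state index
  uint64_t cmaskh;  // required high-control values, expanded over emaskh
};

inline HighLowMasks SplitControls(const std::vector<unsigned>& cqs,
                                  uint64_t cmask, unsigned num_qubits) {
  HighLowMasks m{0, 0, 0, 0, 0};
  for (auto q : cqs) {
    if (q > 4) {
      m.emaskh |= uint64_t{1} << q;  // tested by the kernel's index math
    } else {
      ++m.cl;
      m.emaskl |= uint64_t{1} << q;  // folded into the matrix on the host
    }
  }
  // cqs is assumed sorted, so the low `cl` bits of cmask belong to the
  // low controls and the remaining bits to the high controls.
  m.cmaskh = bits::ExpandBits(cmask >> m.cl, num_qubits, m.emaskh);
  m.cmaskl = bits::ExpandBits(cmask & ((uint64_t{1} << m.cl) - 1), 5, m.emaskl);
  return m;
}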
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HHLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HLLL_H_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4HLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4HLLL_L_Kernel<<>>( - d_wf, d_ms, d_xss, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4LLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4LLLL_H_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void ApplyControlledGate4LLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 5, emaskl); - - for (auto q : qs) { - if (q > 4) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 31; - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[32 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - - ApplyControlledGate4LLLL_L_Kernel<<>>( - d_wf, state.num_qubits(), cmaskh, emaskh, d_idx, rstate); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue1H_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue1L(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[32]; - fp_type wf[128]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j 
< 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue1L_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2HH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 32 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2HH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2HL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
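/* Host-side shape of the two-stage sum shared by the ExpectationValue*
   methods above: the per-gate kernel writes one partial Complex sum per
   thread block into scratch memory, and when more than one block was
   launched, Reduce2Kernel folds those partials into a single value that is
   then copied back to the host. ReducePartialSums is a hypothetical wrapper
   written only for illustration; Complex, Plus, Reduce2Kernel and ErrorCheck
   are the names used in the removed code, and the double template argument
   to Plus is an assumption. */
template <typename Complex>
Complex ReducePartialSums(Complex* resd1, Complex* resd2, unsigned blocks,
                          unsigned num_threads) {
  Complex result;

  if (blocks == 1) {
    // A single block already holds the full sum.
    ErrorCheck(
        cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost));
  } else {
    auto op = Plus<double>();

    unsigned threads2 = std::min(num_threads, std::max(32U, blocks));
    unsigned dblocks2 = std::max(1U, blocks / threads2);
    unsigned bytes2 = threads2 * sizeof(Complex);  // dynamic shared memory

    Reduce2Kernel<<<1, threads2, bytes2>>>(
        dblocks2, blocks, op, op, resd1, resd2);
    ErrorCheck(cudaPeekAtLastError());
    ErrorCheck(cudaDeviceSynchronize());

    ErrorCheck(
        cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost));
  }

  return result;
}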
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2HL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue2LL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[96]; - fp_type wf[256]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 256 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue2LL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 128 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3HLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3HLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue3LLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[224]; - fp_type wf[512]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue3LLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 512 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4HLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4HLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue4LLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[480]; - fp_type wf[1024]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 1024 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue4LLLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5HLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[4096]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 4096 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5HLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue5LLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[32]; - unsigned idx[992]; - fp_type wf[2048]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 32 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 2048 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue5LLLLL_Kernel<<>>( - d_wf, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - ErrorCheck( - cudaMemcpy(d_wf, matrix, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 7 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 64 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 11; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHHH_Kernel<<>>( - d_wf, d_ms, d_xss, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[32]; - fp_type wf[131072]; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 2) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 131072 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 32 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 6 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 32 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHHL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[96]; - fp_type wf[65536]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 4) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 65536 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 96 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 5 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 16 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHHLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[224]; - fp_type wf[32768]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 8) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 32768 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 224 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHHLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HHLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[480]; - fp_type wf[16384]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 16) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 16384 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 480 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 3 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HHLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - std::complex ExpectationValue6HLLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[5] + 1); - ms[0] = (uint64_t{1} << qs[5]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[32]; - unsigned idx[992]; - fp_type wf[8192]; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]) - | (1 << qs[4]); - - for (unsigned i = 0; i < 31; ++i) { - for (unsigned j = 0; j < 32; ++j) { - idx[32 * i + j] = - MaskedAdd(j, i + 1, qmask, 32) | (j & (0xffffffff ^ qmask)); - } - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 32; ++j) { - unsigned k = bits::CompressBits(j, 5, qmask); - p[j] = 2 * (2048 * i + 64 * k + 32 * (m / 32) + (k + m) % 32); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 32; ++j) { - wf[32 * l + j + 32] = matrix[p[j] + 1]; - } - } - } - - ErrorCheck( - cudaMemcpy(d_wf, wf, 8192 * sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_idx, idx, 992 * sizeof(unsigned), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_ms, ms, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(d_xss, xss, 2 * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Complex = qsim::Complex; - - unsigned threads = std::min(32 * size, uint64_t{param_.num_threads}); - unsigned blocks = 32 * size / threads; - unsigned bytes = threads * sizeof(Complex); - - Complex* resd2 = (Complex*) AllocScratch((blocks + 1) * sizeof(Complex)); - Complex* resd1 = resd2 + 1; - - auto op1 = Plus(); - - ExpectationValue6HLLLLL_Kernel<<>>( - d_wf, d_ms, d_xss, d_idx, rstate, op1, resd1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - Complex result; - - if (blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, resd1, sizeof(Complex), cudaMemcpyDeviceToHost)); - } else { - auto op2 = Plus(); - - unsigned threads2 = std::min(param_.num_threads, std::max(32U, blocks)); - unsigned dblocks2 = std::max(1U, blocks / threads2); - unsigned bytes2 = threads2 * sizeof(Complex); - - Reduce2Kernel<<<1, threads2, bytes2>>>( - dblocks2, blocks, op2, op1, resd1, resd2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, resd2, sizeof(Complex), cudaMemcpyDeviceToHost)); - } - - return {result.re, result.im}; - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 5, mask); - return bits::ExpandBits((c + b) % lsize, 5, mask); - } - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } + const_cast(scratch_size_) = size; + } return scratch_; } - Parameter param_; - - fp_type* d_wf; - unsigned* d_idx; - uint64_t* d_ms; - uint64_t* d_xss; + char* d_ws; + char h_ws0[max_buf_size]; + char* h_ws = (char*) h_ws0; void* scratch_; uint64_t scratch_size_; diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index c2e66273..6510fadf 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -18,4519 +18,660 @@ #include #include -#include -#include -#include - #include "util_cuda.h" namespace qsim { -template -__device__ __forceinline__ Integer ExpandBits( - Integer bits, unsigned n, Integer mask) { - Integer ebits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - ebits |= ((bits >> k) & 1) << i; - ++k; - } - } - - return ebits; -} +template +__global__ void ApplyGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. -template -__global__ void ApplyGate1H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + fp_type rs[gsize], is[gsize]; - auto p0 = rstate + 2 * k + lane; + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate1L_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + __syncthreads(); - *(p0) = rn; - *(p0 + 32) = in; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate2HH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - j += 2; + __syncthreads(); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate2HL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - 
unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + unsigned j = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; } } +} - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; +template +__global__ void ApplyGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned esize, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. -template -__global__ void ApplyGate2LL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - auto p0 = rstate + 64 * i + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); + __shared__ fp_type v[2 * gsize * rows]; + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyGate3HHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); } - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate3HHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + auto p0 = rstate + 2 * ii + threadIdx.x; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - unsigned j = lane; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } -}; - -template -__global__ void 
ApplyGate3HLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - } - unsigned j = lane; + unsigned j = 0; - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - j += 64; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - j += 64; + rs0[m][n] = rn; + is0[m][n] = in; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate3LLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } +} - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; +template +__global__ void ApplyControlledGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. -template -__global__ void ApplyGate4HHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 
gsize : 32) : (G < 5 ? 8 : 16)); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + fp_type rs[gsize], is[gsize]; - auto p0 = rstate + 2 * k + lane; + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate4HHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } } - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + __syncthreads(); - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate4HHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + ii |= cvalsh; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); - j += 64; + for (unsigned m = 0; m < 
2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + __syncthreads(); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate4HLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + unsigned j = 0; - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; } } +} - unsigned j = lane; +template +__global__ void ApplyControlledGateLH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, + unsigned esize, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - j += 64; + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + fp_type rs[gsize], is[gsize]; - j += 64; - } + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } -}; - -template -__global__ void ApplyGate4LLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + ii |= cvalsh; - auto p0 = rstate + 64 * i + lane; + auto p0 = rstate + 2 * ii + threadIdx.x; - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0) = rn; - *(p0 + 32) = in; } -}; -template -__global__ void ApplyGate5HHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - unsigned j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate5HHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + unsigned j = 0; - uint64_t k = (32 * i & ms[0]) | (64 
* i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + rs0[m][n] = rn; + is0[m][n] = in; } } - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } -}; +} -template -__global__ void ApplyGate5HHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; +template +__global__ void ApplyControlledGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, + unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - auto p0 = rstate + 2 * k + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; } - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; + ii |= cvalsh; - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + auto p0 = rstate + 2 * ii + cis[threadIdx.x]; - j += 64; + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate5HHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate5HLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * 
w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyGate5LLLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; + unsigned j = 0; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - auto p0 = rstate + 64 * i + lane; + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned l = 0; l < 1; ++l) { - rs[32 * l] = *(p0); - is[32 * l] = *(p0 + 32); + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); + rs0[m][n] = rn; + is0[m][n] = in; } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyGate6HHHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]) - | (2048 * i & ms[6]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); } +} - unsigned j = 0; +template +__global__ void ExpectationValueH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 64. - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - j += 2; + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 
4 : 8) : 8); - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + fp_type rs[gsize], is[gsize]; - j += 2; - } + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; } -}; - -template -__global__ void ApplyGate6HHHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate6HHHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); + __syncthreads(); - auto p0 = rstate + 2 * k + lane; + double re = 0; + double im = 0; - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); + for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); + idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + auto p0 = rstate + 2 * ii + threadIdx.x % 32; - j += 64; + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + __syncthreads(); -template 
-__global__ void ApplyGate6HHHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + __syncthreads(); + } - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); + unsigned j = 0; - auto p0 = rstate + 2 * k + lane; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } } } - unsigned j = lane; + __shared__ cfp_type partial1[64]; + __shared__ cfp_type partial2[2]; - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + partial1[threadIdx.x].re = re; + partial1[threadIdx.x].im = im; - j += 64; + auto val = WarpReduce(partial1[threadIdx.x], op); - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; + if (threadIdx.x % 32 == 0) { + partial2[threadIdx.x / 32] = val; + } - j += 64; - } + __syncthreads(); - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; + if (threadIdx.x == 0) { + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; } -}; +} -template -__global__ void ApplyGate6HHLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; +template +__global__ void ExpectationValueL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 32. - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); - auto p0 = rstate + 2 * k + lane; + fp_type rs[gsize], is[gsize]; - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; } -}; - -template -__global__ void ApplyGate6HLLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - auto p0 = rstate + 2 * k + lane; + double re = 0; + double im = 0; - for (unsigned l = 0; l < 2; ++l) { - rs[32 * l] = *(p0 + xss[l]); - is[32 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); + for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - j += 64; + auto p0 = rstate + 2 * ii + threadIdx.x; - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1H_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 
1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1H_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + unsigned j = 0; - j += 64; + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1L_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } } } - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; + __shared__ cfp_type partial[32]; - j += 64; + partial[threadIdx.x].re = re; + partial[threadIdx.x].im = im; - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } + auto val = WarpReduce(partial[threadIdx.x], op); - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate1L_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * 
l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } + if (threadIdx.x == 0) { + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - 
*(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2HL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2LL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate2LL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* 
__restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HHL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 
32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3HLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3LLL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 
* (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate3LLL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHH_H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHH_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHL_H_Kernel( - const fp_type* 
__restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHHL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HHLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void 
ApplyControlledGate4HHLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HLLL_H_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4HLLL_L_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 32) = in; 
- } -}; - -template -__global__ void ApplyControlledGate4LLLL_H_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ApplyControlledGate4LLLL_L_Kernel( - const fp_type* __restrict__ w, unsigned num_qubits, - uint64_t cmaskh, uint64_t emaskh, const unsigned* __restrict__ idx, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - *(p0) = rn; - *(p0 + 32) = in; - } -}; - -template -__global__ void ExpectationValue1H_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - 
if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue1L_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[2], is[2]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = *(p0); - is[2 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2HH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2HL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = 
(uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue2LL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[4], is[4]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = *(p0); - is[4 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = 
*(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3HLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - 
} - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue3LLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[8], is[8]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = *(p0); - is[8 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * 
in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * 
rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4HLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue4LLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[16], is[16]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = *(p0); - is[16 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im 
= im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = 
WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } 
- - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5HLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 16 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue5LLLLL_Kernel( - const fp_type* __restrict__ w, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[32], is[32]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - auto p0 = rstate + 64 * i + lane; - - for (unsigned l = 0; l < 1; ++l) { - rs[32 * l] = *(p0); - is[32 * l] = *(p0 + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 1; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - 
if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHHH_Kernel( - const fp_type* __restrict__ v, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]) - | (2048 * i & ms[6]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 32); - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = 0; - - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHHL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]) | (1024 * i & ms[5]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = *(p0 + xss[l]); - is[2 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 2; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[2 * l + j] = __shfl_sync(0xffffffff, rs[2 * l], idx[k]); - is[2 * l + j] = __shfl_sync(0xffffffff, is[2 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 2 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - 
result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHHLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]) | (512 * i & ms[4]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = *(p0 + xss[l]); - is[4 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 4; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[4 * l + j] = __shfl_sync(0xffffffff, rs[4 * l], idx[k]); - is[4 * l + j] = __shfl_sync(0xffffffff, is[4 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 4 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHHLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]) - | (256 * i & ms[3]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = *(p0 + xss[l]); - is[8 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 8; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[8 * l + j] = __shfl_sync(0xffffffff, rs[8 * l], idx[k]); - is[8 * l + j] = __shfl_sync(0xffffffff, is[8 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 8 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if 
(threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HHLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]) | (128 * i & ms[2]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = *(p0 + xss[l]); - is[16 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 16; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[16 * l + j] = __shfl_sync(0xffffffff, rs[16 * l], idx[k]); - is[16 * l + j] = __shfl_sync(0xffffffff, is[16 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 16 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r = 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; - -template -__global__ void ExpectationValue6HLLLLL_Kernel( - const fp_type* __restrict__ w, const uint64_t* __restrict__ ms, - const uint64_t* __restrict__ xss, const unsigned* __restrict__ idx, - const fp_type* rstate, - Op op, FP* result) { - fp_type rn, in; - fp_type rs[64], is[64]; - - unsigned lane = threadIdx.x % 32; - uint64_t i = (uint64_t{blockDim.x} * blockIdx.x + threadIdx.x) / 32; - - uint64_t k = (32 * i & ms[0]) | (64 * i & ms[1]); - - auto p0 = rstate + 2 * k + lane; - - for (unsigned l = 0; l < 2; ++l) { - rs[32 * l] = *(p0 + xss[l]); - is[32 * l] = *(p0 + xss[l] + 32); - - for (unsigned j = 1; j < 32; ++j) { - unsigned k = 32 * (j - 1) + lane; - rs[32 * l + j] = __shfl_sync(0xffffffff, rs[32 * l], idx[k]); - is[32 * l + j] = __shfl_sync(0xffffffff, is[32 * l], idx[k]); - } - } - - fp_type re = 0; - fp_type im = 0; - - unsigned j = lane; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * w[j] - is[0] * w[j + 32]; - in = rs[0] * w[j + 32] + is[0] * w[j]; - - j += 64; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * w[j] - is[n] * w[j + 32]; - in += rs[n] * w[j + 32] + is[n] * w[j]; - - j += 64; - } - - unsigned m = 32 * l; - - re += rs[m] * rn + is[m] * in; - im += rs[m] * in - is[m] * rn; - } - - extern __shared__ float shared[]; - FP* partial1 = (FP*) shared; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - __shared__ FP partial2[32]; - - if (threadIdx.x < 32) { - partial2[threadIdx.x] = 0; - } - - __syncthreads(); - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (lane == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - FP r 
= 0; - - if (threadIdx.x < 32) { - r = WarpReduce(partial2[lane], op); - } - - if (threadIdx.x == 0) { - result[blockIdx.x] = r; - } -}; +} } // namespace qsim diff --git a/lib/vectorspace.h b/lib/vectorspace.h index 5a5a6c94..246394e1 100644 --- a/lib/vectorspace.h +++ b/lib/vectorspace.h @@ -160,6 +160,8 @@ class VectorSpace { return true; } + void DeviceSync() {} + protected: For for_; }; diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index ac228c63..d26f003f 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -141,6 +141,10 @@ class VectorSpaceCUDA { return true; } + void DeviceSync() { + cudaDeviceSynchronize(); + } + protected: }; diff --git a/pybind_interface/cuda/pybind_main_cuda.cpp b/pybind_interface/cuda/pybind_main_cuda.cpp index 88fa3a61..57b0ba84 100644 --- a/pybind_interface/cuda/pybind_main_cuda.cpp +++ b/pybind_interface/cuda/pybind_main_cuda.cpp @@ -27,19 +27,17 @@ namespace qsim { unsigned num_sim_threads, unsigned num_state_threads, unsigned num_dblocks - ) : ss_params{num_state_threads, num_dblocks}, - sim_params{num_sim_threads} {} + ) : ss_params{num_state_threads, num_dblocks} {} StateSpace CreateStateSpace() const { return StateSpace(ss_params); } Simulator CreateSimulator() const { - return Simulator(sim_params); + return Simulator(); } StateSpace::Parameter ss_params; - Simulator::Parameter sim_params; }; inline void SetFlushToZeroAndDenormalsAreZeros() {} diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp index 5ddee99f..74fa3a31 100644 --- a/pybind_interface/pybind_main.cpp +++ b/pybind_interface/pybind_main.cpp @@ -399,7 +399,6 @@ std::vector> qsim_simulate(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -464,7 +463,6 @@ std::vector> qtrajectory_simulate(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -659,7 +657,6 @@ class SimulatorHelper { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -985,7 +982,6 @@ std::vector qsim_sample(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } @@ -1054,7 +1050,6 @@ std::vector qtrajectory_sample(const py::dict &options) { if (use_gpu == 0) { num_sim_threads = parseOptions(options, "t\0"); } else if (gpu_mode == 0) { - num_sim_threads = parseOptions(options, "gsmt\0"); num_state_threads = parseOptions(options, "gsst\0"); num_dblocks = parseOptions(options, "gdb\0"); } diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py index b59a6f36..37d497ed 100644 --- a/qsimcirq/qsim_simulator.py +++ b/qsimcirq/qsim_simulator.py @@ -63,12 +63,10 @@ class QSimOptions: gpu_mode: use CUDA if set to 0 (default value) or use the NVIDIA cuStateVec library if set to any other value. 
The "gpu_*" arguments below are only considered if this is set to 0. - gpu_sim_threads: number of threads per CUDA block to use for the GPU - Simulator. This must be a power of 2 in the range [32, 256]. gpu_state_threads: number of threads per CUDA block to use for the GPU StateSpace. This must be a power of 2 in the range [32, 1024]. - gpu_data_blocks: number of data blocks to use on GPU. Below 16 data - blocks, performance is noticeably reduced. + gpu_data_blocks: number of data blocks to use for the GPU StateSpace. + Below 16 data blocks, performance is noticeably reduced. verbosity: Logging verbosity. denormals_are_zeros: if true, set flush-to-zero and denormals-are-zeros MXCSR control flags. This prevents rare cases of performance @@ -80,7 +78,6 @@ class QSimOptions: ev_noisy_repetitions: int = 1 use_gpu: bool = False gpu_mode: int = 0 - gpu_sim_threads: int = 256 gpu_state_threads: int = 512 gpu_data_blocks: int = 16 verbosity: int = 0 @@ -97,7 +94,6 @@ def as_dict(self): "r": self.ev_noisy_repetitions, "g": self.use_gpu, "gmode": self.gpu_mode, - "gsmt": self.gpu_sim_threads, "gsst": self.gpu_state_threads, "gdb": self.gpu_data_blocks, "v": self.verbosity, diff --git a/tests/hybrid_cuda_test.cu b/tests/hybrid_cuda_test.cu index 28da96dc..6f2640a3 100644 --- a/tests/hybrid_cuda_test.cu +++ b/tests/hybrid_cuda_test.cu @@ -26,35 +26,30 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TEST(HybridCUDATest, Hybrid2) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestHybrid2(factory); } TEST(HybridCUDATest, Hybrid4) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestHybrid4(factory); } diff --git a/tests/qtrajectory_cuda_test.cu b/tests/qtrajectory_cuda_test.cu index 730ff7ed..459a25f6 100644 --- a/tests/qtrajectory_cuda_test.cu +++ b/tests/qtrajectory_cuda_test.cu @@ -26,75 +26,65 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TEST(QTrajectoryCUDATest, BitFlip) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + 
Factory::StateSpace::Parameter param; + Factory factory(param); TestBitFlip(factory); } TEST(QTrajectoryCUDATest, GenDump) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestGenDump(factory); } TEST(QTrajectoryCUDATest, ReusingResults) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestReusingResults(factory); } TEST(QTrajectoryCUDATest, CollectKopStat) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestCollectKopStat(factory); } TEST(QTrajectoryCUDATest, CleanCircuit) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestCleanCircuit(factory); } TEST(QTrajectoryCUDATest, InitialState) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestInitialState(factory); } TEST(QTrajectoryCUDATest, UncomputeFinalState) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestUncomputeFinalState(factory); } diff --git a/tests/simulator_cuda_test.cu b/tests/simulator_cuda_test.cu index efc202aa..baecfd45 100644 --- a/tests/simulator_cuda_test.cu +++ b/tests/simulator_cuda_test.cu @@ -34,152 +34,88 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TYPED_TEST(SimulatorCUDATest, ApplyGate1) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate1(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate1(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate2) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate2(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate2(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate3) { using Factory = qsim::Factory; - - for 
(unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate3(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate3(factory); } TYPED_TEST(SimulatorCUDATest, ApplyGate5) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestApplyGate5(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestApplyGate5(factory); } TYPED_TEST(SimulatorCUDATest, CircuitWithControlledGates) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestCircuitWithControlledGates(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestCircuitWithControlledGates(factory); } TYPED_TEST(SimulatorCUDATest, CircuitWithControlledGatesDagger) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestCircuitWithControlledGatesDagger(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestCircuitWithControlledGatesDagger(factory); } TYPED_TEST(SimulatorCUDATest, MultiQubitGates) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestMultiQubitGates(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestMultiQubitGates(factory); } TYPED_TEST(SimulatorCUDATest, ControlledGates) { using Factory = qsim::Factory; - + typename Factory::StateSpace::Parameter param; + Factory factory(param); bool high_precision = std::is_same::value; - - for (unsigned num_threads : {64, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestControlledGates(factory, high_precision); - } + TestControlledGates(factory, high_precision); } TYPED_TEST(SimulatorCUDATest, ExpectationValue1) { using Factory = qsim::Factory; - - for (unsigned num_threads : {32, 64, 128, 256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestExpectationValue1(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestExpectationValue1(factory); } TYPED_TEST(SimulatorCUDATest, ExpectationValue2) { using Factory = qsim::Factory; - - for (unsigned num_threads : {256}) { - typename Factory::Simulator::Parameter param; - param.num_threads = num_threads; - - Factory factory(typename Factory::StateSpace::Parameter(), param); - - TestExpectationValue2(factory); - } + typename Factory::StateSpace::Parameter param; + Factory factory(param); + TestExpectationValue2(factory); } } // namespace qsim diff --git 
a/tests/simulator_testfixture.h b/tests/simulator_testfixture.h index ef335565..31cdcc7e 100644 --- a/tests/simulator_testfixture.h +++ b/tests/simulator_testfixture.h @@ -1147,60 +1147,63 @@ void TestMultiQubitGates(const Factory& factory) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_minq = 4; - unsigned max_gate_qubits = 6; - unsigned num_qubits = max_gate_qubits + max_minq; + unsigned max_num_qubits = 10 + std::log2(Simulator::SIMDRegisterSize()); StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(num_qubits); - std::vector matrix; - matrix.reserve(1 << (2 * max_gate_qubits + 1)); + matrix.reserve(1 << (2 * 6 + 1)); std::vector qubits; - qubits.reserve(max_gate_qubits); - - std::vector vec(state_space.MinSize(num_qubits)); + qubits.reserve(6); - unsigned size = 1 << num_qubits; - fp_type inorm = std::sqrt(1.0 / (1 << num_qubits)); + std::vector vec(state_space.MinSize(max_num_qubits)); - for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned size1 = 1 << q; - unsigned size2 = size1 * size1; + for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + auto state = state_space.Create(num_qubits); - matrix.resize(0); + unsigned size = 1 << num_qubits; + fp_type inorm = std::sqrt(1.0 / (1 << num_qubits)); + unsigned max_gate_qubits = std::min(6U, num_qubits); - for (unsigned i = 0; i < 2 * size2; ++i) { - matrix.push_back(i + 1); - } + for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned mask = (1 << q) - 1; + unsigned size1 = 1 << q; + unsigned size2 = size1 * size1; - for (unsigned k = 0; k <= max_minq; ++k) { - qubits.resize(0); + matrix.resize(0); - for (unsigned i = 0; i < q; ++i) { - qubits.push_back(i + k); + for (unsigned i = 0; i < 2 * size2; ++i) { + matrix.push_back(i + 1); } - state_space.SetStateUniform(state); - simulator.ApplyGate(qubits, matrix.data(), state); + unsigned mask = (1 << q) - 1; + unsigned max_minq = num_qubits - q; - state_space.InternalToNormalOrder(state); - state_space.Copy(state, vec.data()); + for (unsigned k = 0; k <= max_minq; ++k) { + qubits.resize(0); + + for (unsigned i = 0; i < q; ++i) { + qubits.push_back(i + k); + } - for (unsigned i = 0; i < size; ++i) { - unsigned j = (i >> k) & mask; + state_space.SetStateUniform(state); + simulator.ApplyGate(qubits, matrix.data(), state); - // Expected results are calculated analytically. - fp_type expected_real = size2 * (1 + 2 * j) * inorm; - fp_type expected_imag = expected_real + size1 * inorm; + state_space.InternalToNormalOrder(state); + state_space.Copy(state, vec.data()); - EXPECT_NEAR(vec[2 * i], expected_real, 1e-6); - EXPECT_NEAR(vec[2 * i + 1], expected_imag, 1e-6); + for (unsigned i = 0; i < size; ++i) { + unsigned j = (i >> k) & mask; + + // Expected results are calculated analytically. 
+ fp_type expected_real = size2 * (1 + 2 * j) * inorm; + fp_type expected_imag = expected_real + size1 * inorm; + + EXPECT_NEAR(vec[2 * i] / expected_real, 1.0, 1e-6); + EXPECT_NEAR(vec[2 * i + 1] / expected_imag, 1.0, 1e-6); + } } } } @@ -1212,15 +1215,13 @@ void TestControlledGates(const Factory& factory, bool high_precision) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_qubits = 5 + std::log2(Simulator::SIMDRegisterSize()); + unsigned max_qubits = 6 + std::log2(Simulator::SIMDRegisterSize()); unsigned max_target_qubits = 4; unsigned max_control_qubits = 3; StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(max_qubits); - std::vector qubits; qubits.reserve(max_qubits); @@ -1237,6 +1238,8 @@ void TestControlledGates(const Factory& factory, bool high_precision) { unsigned size = 1 << num_qubits; unsigned nmask = size - 1; + auto state = state_space.Create(num_qubits); + // Iterate over control qubits (as a binary mask). for (unsigned cmask = 0; cmask <= nmask; ++cmask) { cqubits.resize(0); @@ -1359,47 +1362,54 @@ void TestExpectationValue1(const Factory& factory) { using StateSpace = typename Simulator::StateSpace; using fp_type = typename StateSpace::fp_type; - unsigned max_minq = 4; - unsigned max_gate_qubits = 6; - unsigned num_qubits = max_gate_qubits + max_minq; + unsigned rsize = std::log2(Simulator::SIMDRegisterSize()); + unsigned max_num_qubits = 10 + rsize; StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(num_qubits); - std::vector matrix; - matrix.reserve(1 << (2 * max_gate_qubits + 1)); + matrix.reserve(1 << (2 * 6 + 1)); std::vector qubits; - qubits.reserve(max_gate_qubits); + qubits.reserve(6); - for (unsigned q = 1; q <= max_gate_qubits; ++q) { - unsigned size1 = 1 << q; - unsigned size2 = size1 * size1; + std::vector vec(state_space.MinSize(max_num_qubits)); - // Expected results are calculated analytically. - fp_type expected_real = size2 * size1; - fp_type expected_imag = expected_real + size1; + for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + auto state = state_space.Create(num_qubits); - matrix.resize(0); + unsigned max_gate_qubits = std::min(6U, num_qubits); - for (unsigned i = 0; i < 2 * size2; ++i) { - matrix.push_back(i + 1); - } + for (unsigned q = 1; q <= max_gate_qubits; ++q) { + unsigned size1 = 1 << q; + unsigned size2 = size1 * size1; + + // Expected results are calculated analytically. 
+ fp_type expected_real = size2 * size1; + fp_type expected_imag = expected_real + size1; - for (unsigned k = 0; k <= max_minq; ++k) { - qubits.resize(0); + matrix.resize(0); - for (unsigned i = 0; i < q; ++i) { - qubits.push_back(i + k); + for (unsigned i = 0; i < 2 * size2; ++i) { + matrix.push_back(i + 1); } - state_space.SetStateUniform(state); - auto eval = simulator.ExpectationValue(qubits, matrix.data(), state); + unsigned max_minq = std::min(num_qubits - q, rsize + 3); - EXPECT_NEAR(std::real(eval), expected_real, 1e-6); - EXPECT_NEAR(std::imag(eval), expected_imag, 1e-6); + for (unsigned k = 0; k <= max_minq; ++k) { + qubits.resize(0); + + for (unsigned i = 0; i < q; ++i) { + qubits.push_back(i + k); + } + + state_space.SetStateUniform(state); + auto eval = simulator.ExpectationValue(qubits, matrix.data(), state); + + EXPECT_NEAR(std::real(eval) / expected_real, 1.0, 1e-6); + EXPECT_NEAR(std::imag(eval) / expected_imag, 1.0, 1e-6); + } } } } diff --git a/tests/statespace_cuda_test.cu b/tests/statespace_cuda_test.cu index 489b5bf0..51b3ffb8 100644 --- a/tests/statespace_cuda_test.cu +++ b/tests/statespace_cuda_test.cu @@ -33,32 +33,28 @@ struct Factory { using Simulator = qsim::SimulatorCUDA; using StateSpace = typename Simulator::StateSpace; - Factory(const typename StateSpace::Parameter& param1, - const typename Simulator::Parameter& param2) - : param1(param1), param2(param2) {} + Factory(const typename StateSpace::Parameter& param) : param(param) {} StateSpace CreateStateSpace() const { - return StateSpace(param1); + return StateSpace(param); } Simulator CreateSimulator() const { - return Simulator(param2); + return Simulator(); } - typename StateSpace::Parameter param1; - typename Simulator::Parameter param2; + typename StateSpace::Parameter param; }; TYPED_TEST(StateSpaceCUDATest, Add) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestAdd(factory); } } @@ -66,14 +62,13 @@ TYPED_TEST(StateSpaceCUDATest, Add) { TYPED_TEST(StateSpaceCUDATest, NormSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormSmall(factory); } } @@ -81,14 +76,13 @@ TYPED_TEST(StateSpaceCUDATest, NormSmall) { TYPED_TEST(StateSpaceCUDATest, NormAndInnerProductSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormAndInnerProductSmall(factory); } } @@ -96,14 +90,13 @@ TYPED_TEST(StateSpaceCUDATest, NormAndInnerProductSmall) { TYPED_TEST(StateSpaceCUDATest, NormAndInnerProduct) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + 
param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestNormAndInnerProduct(factory); } } @@ -111,14 +104,13 @@ TYPED_TEST(StateSpaceCUDATest, NormAndInnerProduct) { TYPED_TEST(StateSpaceCUDATest, SamplingSmall) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestSamplingSmall(factory); } } @@ -126,14 +118,13 @@ TYPED_TEST(StateSpaceCUDATest, SamplingSmall) { TYPED_TEST(StateSpaceCUDATest, SamplingCrossEntropyDifference) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestSamplingCrossEntropyDifference(factory); } } @@ -141,14 +132,13 @@ TYPED_TEST(StateSpaceCUDATest, SamplingCrossEntropyDifference) { TYPED_TEST(StateSpaceCUDATest, Ordering) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestOrdering(factory); } } @@ -156,22 +146,20 @@ TYPED_TEST(StateSpaceCUDATest, Ordering) { TEST(StateSpaceCUDATest, MeasurementSmall) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + Factory::StateSpace::Parameter param; + Factory factory(param); TestMeasurementSmall(factory, true); } TYPED_TEST(StateSpaceCUDATest, MeasurementLarge) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestMeasurementLarge(factory); } } @@ -179,14 +167,13 @@ TYPED_TEST(StateSpaceCUDATest, MeasurementLarge) { TYPED_TEST(StateSpaceCUDATest, Collapse) { using Factory = qsim::Factory; + typename Factory::StateSpace::Parameter param; for (unsigned num_dblocks : {2, 16}) { + param.num_dblocks = num_dblocks; for (unsigned num_threads : {64, 256, 1024}) { - typename Factory::StateSpace::Parameter param; param.num_threads = num_threads; - - Factory factory(param, typename Factory::Simulator::Parameter()); - + Factory factory(param); TestCollapse(factory); } } @@ -194,22 +181,21 @@ TYPED_TEST(StateSpaceCUDATest, Collapse) { TEST(StateSpaceCUDATest, InvalidStateSize) { using Factory = qsim::Factory; - Factory::StateSpace::Parameter param1; - Factory::Simulator::Parameter param2; - Factory factory(param1, param2); + 
Factory::StateSpace::Parameter param;
+  Factory factory(param);
 
   TestInvalidStateSize(factory);
 }
 
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmpl) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitude(factory);
     }
   }
@@ -217,14 +203,13 @@ TYPED_TEST(StateSpaceCUDATest, BulkSetAmpl) {
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmplExclusion) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitudeExclusion(factory);
     }
   }
@@ -232,14 +217,13 @@ TYPED_TEST(StateSpaceCUDATest, BulkSetAmplExclusion) {
 TYPED_TEST(StateSpaceCUDATest, BulkSetAmplDefault) {
   using Factory = qsim::Factory;
 
+  typename Factory::StateSpace::Parameter param;
+
   for (unsigned num_dblocks : {2, 16}) {
+    param.num_dblocks = num_dblocks;
     for (unsigned num_threads : {64, 256, 1024}) {
-      typename Factory::StateSpace::Parameter param;
       param.num_threads = num_threads;
-
-      Factory factory(param, typename Factory::Simulator::Parameter());
-
+      Factory factory(param);
       TestBulkSetAmplitudeDefault(factory);
     }
   }
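
Usage note (not part of the patch): the tests above all follow the same construction pattern, where StateSpace keeps its Parameter (num_threads, num_dblocks) and the simulator is now built without one. The minimal sketch below illustrates that pattern outside the test factories; the qubit count, gate values, and parameter settings are arbitrary, and it assumes a CUDA-enabled qsim build with simulator_cuda.h on the include path.

// Illustrative sketch only: construct the CUDA state space and the
// parameterless simulator, then apply a single gate.
#include <vector>

#include "simulator_cuda.h"

int main() {
  using Simulator = qsim::SimulatorCUDA<float>;
  using StateSpace = Simulator::StateSpace;

  StateSpace::Parameter param;
  param.num_threads = 256;  // threads per block for StateSpace kernels
  param.num_dblocks = 16;   // data blocks; fewer than 16 reduces performance

  StateSpace state_space(param);
  Simulator simulator;      // no Simulator::Parameter anymore

  auto state = state_space.Create(2);  // two-qubit state
  state_space.SetStateUniform(state);  // uniform superposition

  // Hadamard on qubit 0; matrix entries are interleaved (re, im) pairs.
  const float h = 0.7071067811865475f;
  const float matrix[] = {h, 0, h, 0, h, 0, -h, 0};
  simulator.ApplyGate({0}, matrix, state);

  return 0;
}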