diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml index 9b29cda3..a7fb03f1 100644 --- a/.github/workflows/bazeltest.yml +++ b/.github/workflows/bazeltest.yml @@ -26,8 +26,8 @@ jobs: run: git submodule update --init --recursive - name: Install Bazel on CI run: | - wget https://github.com/bazelbuild/bazel/releases/download/0.26.0/bazel_0.26.0-linux-x86_64.deb - sudo dpkg -i bazel_0.26.0-linux-x86_64.deb + wget https://github.com/bazelbuild/bazel/releases/download/3.7.2/bazel_3.7.2-linux-x86_64.deb + sudo dpkg -i bazel_3.7.2-linux-x86_64.deb - name: Run C++ tests run: | bazel test --config=${{ matrix.hardware_opt }} \ @@ -52,8 +52,8 @@ jobs: run: git submodule update --init --recursive - name: Install Bazel on CI run: | - wget https://github.com/bazelbuild/bazel/releases/download/0.26.0/bazel_0.26.0-linux-x86_64.deb - sudo dpkg -i bazel_0.26.0-linux-x86_64.deb + wget https://github.com/bazelbuild/bazel/releases/download/3.7.2/bazel_3.7.2-linux-x86_64.deb + sudo dpkg -i bazel_3.7.2-linux-x86_64.deb - name: Run C++ tests run: | bazel test --config=avx --config=openmp \ @@ -69,8 +69,8 @@ jobs: run: git submodule update --init --recursive - name: Install Bazel on CI run: | - wget https://github.com/bazelbuild/bazel/releases/download/0.26.0/bazel_0.26.0-linux-x86_64.deb - sudo dpkg -i bazel_0.26.0-linux-x86_64.deb + wget https://github.com/bazelbuild/bazel/releases/download/3.7.2/bazel_3.7.2-linux-x86_64.deb + sudo dpkg -i bazel_3.7.2-linux-x86_64.deb - name: Install google-perftools for tcmalloc run: sudo apt-get install libgoogle-perftools-dev - name: Run C++ tests diff --git a/.github/workflows/cirq_compatibility.yml b/.github/workflows/cirq_compatibility.yml index f9a9df71..f4fd7473 100644 --- a/.github/workflows/cirq_compatibility.yml +++ b/.github/workflows/cirq_compatibility.yml @@ -7,7 +7,7 @@ on: jobs: consistency: name: Nightly Compatibility - runs-on: ubuntu-16.04 + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 - uses: 
actions/setup-python@v1 @@ -18,5 +18,7 @@ jobs: run: pip3 install -U cirq --pre - name: Install qsim requirements run: pip3 install -r requirements.txt + - name: Install test requirements + run: pip3 install -r dev-requirements.txt - name: Run python tests run: make run-py-tests diff --git a/.github/workflows/python_format.yml b/.github/workflows/python_format.yml index adb5f56f..213abf2e 100644 --- a/.github/workflows/python_format.yml +++ b/.github/workflows/python_format.yml @@ -24,9 +24,7 @@ jobs: with: python-version: '3.7' architecture: 'x64' - - name: Install flynt - run: cat requirements.txt | grep flynt | xargs pip install - - name: Install black - run: cat requirements.txt | grep black | xargs pip install + - name: Install dev requirements + run: pip install -r dev-requirements.txt - name: Format run: check/format-incremental diff --git a/.github/workflows/release_wheels.yml b/.github/workflows/release_wheels.yml index 4643ac1e..b209d74e 100644 --- a/.github/workflows/release_wheels.yml +++ b/.github/workflows/release_wheels.yml @@ -26,7 +26,7 @@ jobs: name: win_amd64 architecture: x64 cibw: - build: "cp*win_amd64" + build: "cp36-win_amd64 cp37-win_amd64 cp38-win_amd64 cp39-win_amd64" env: CIBW_BUILD: "${{ matrix.cibw.build || '*' }}" CIBW_ARCHS: "${{ matrix.cibw.arch || 'auto' }}" @@ -36,7 +36,8 @@ jobs: # to install latest delocate package CIBW_DEPENDENCY_VERSIONS: "latest" # due to package and module name conflict have to temporarily move it away to run tests - CIBW_BEFORE_TEST: "mv {package}/qsimcirq /tmp" + CIBW_BEFORE_TEST: mv {package}/qsimcirq /tmp + CIBW_TEST_EXTRAS: "dev" CIBW_TEST_COMMAND: "pytest {package}/qsimcirq_tests/qsimcirq_test.py && mv /tmp/qsimcirq {package}" steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/testing_wheels.yml b/.github/workflows/testing_wheels.yml index 21e6c41f..fa54cf51 100644 --- a/.github/workflows/testing_wheels.yml +++ b/.github/workflows/testing_wheels.yml @@ -42,6 +42,7 @@ jobs: 
CIBW_DEPENDENCY_VERSIONS: "latest" # due to package and module name conflict have to temporarily move it away to run tests CIBW_BEFORE_TEST: "mv {package}/qsimcirq /tmp" + CIBW_TEST_EXTRAS: "dev" CIBW_TEST_COMMAND: "pytest {package}/qsimcirq_tests/qsimcirq_test.py && mv /tmp/qsimcirq {package}" steps: - uses: actions/checkout@v2 diff --git a/CMakeLists.txt b/CMakeLists.txt index d631366c..fba9db45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required(VERSION 3.11) -project(qsim) + +execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc) +if(has_nvcc STREQUAL "") + project(qsim) +else() + project(qsim LANGUAGES CXX CUDA) + ADD_SUBDIRECTORY(pybind_interface/cuda) + if(DEFINED ENV{CUQUANTUM_DIR}) + ADD_SUBDIRECTORY(pybind_interface/custatevec) + endif() +endif() ADD_SUBDIRECTORY(pybind_interface/sse) ADD_SUBDIRECTORY(pybind_interface/avx512) diff --git a/MANIFEST.in b/MANIFEST.in index 2968589b..4b487267 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements.txt +include dev-requirements.txt include CMakeLists.txt graft pybind_interface diff --git a/Makefile b/Makefile index 49d1370b..7cdaa414 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,9 @@ CXXFLAGS = -O3 -fopenmp ARCHFLAGS = -march=native NVCCFLAGS = -O3 +# CUQUANTUM_DIR should be set. 
+CUSTATEVECFLAGS = -I$(CUQUANTUM_DIR)/include -L$(CUQUANTUM_DIR)/lib -L$(CUQUANTUM_DIR)/lib64 -lcustatevec -lcublas + PYBIND11 = true export CXX @@ -18,6 +21,7 @@ export CXXFLAGS export ARCHFLAGS export NVCC export NVCCFLAGS +export CUSTATEVECFLAGS ifeq ($(PYBIND11), true) TARGETS += pybind @@ -35,6 +39,10 @@ qsim: qsim-cuda: $(MAKE) -C apps/ qsim-cuda +.PHONY: qsim-custatevec +qsim-custatevec: + $(MAKE) -C apps/ qsim-custatevec + .PHONY: pybind pybind: $(MAKE) -C pybind_interface/ pybind @@ -47,6 +55,10 @@ cxx-tests: eigen cuda-tests: $(MAKE) -C tests/ cuda-tests +.PHONY: custatevec-tests +custatevec-tests: + $(MAKE) -C tests/ custatevec-tests + .PHONY: run-cxx-tests run-cxx-tests: cxx-tests $(MAKE) -C tests/ run-cxx-tests @@ -55,6 +67,10 @@ run-cxx-tests: cxx-tests run-cuda-tests: cuda-tests $(MAKE) -C tests/ run-cuda-tests +.PHONY: run-custatevec-tests +run-custatevec-tests: custatevec-tests + $(MAKE) -C tests/ run-custatevec-tests + PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py') .PHONY: run-py-tests
@@ -125,4 +127,4 @@ An equivalent BibTex format reference is below for all the versions: doi = {10.5281/zenodo.4023103}, url = {https://doi.org/10.5281/zenodo.4023103} } -``` \ No newline at end of file +``` diff --git a/apps/Makefile b/apps/Makefile index cb16c1c9..41fb81e5 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,8 +1,11 @@ CXX_TARGETS = $(shell find . -maxdepth 1 -name '*.cc') CXX_TARGETS := $(CXX_TARGETS:%.cc=%.x) -CUDA_TARGETS = $(shell find . -maxdepth 1 -name '*.cu') -CUDA_TARGETS := $(CUDA_TARGETS:%.cu=%.x) +CUDA_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu') +CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x) + +CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu") +CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x) .PHONY: qsim qsim: $(CXX_TARGETS) @@ -10,12 +13,18 @@ qsim: $(CXX_TARGETS) .PHONY: qsim-cuda qsim-cuda: $(CUDA_TARGETS) +.PHONY: qsim-custatevec +qsim-custatevec: $(CUSTATEVEC_TARGETS) + %.x: %.cc $(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS) -%.x: %.cu +%cuda.x: %cuda.cu $(NVCC) -o ./$@ $< $(NVCCFLAGS) +%custatevec.x: %custatevec.cu + $(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS) + .PHONY: clean clean: -rm -f ./*.x ./*.a ./*.so ./*.mod diff --git a/apps/make.sh b/apps/make.sh index 679694f8..c742e192 100755 --- a/apps/make.sh +++ b/apps/make.sh @@ -24,3 +24,8 @@ g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu +nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu + +# CUQUANTUM_DIR should be set. 
+CUSTATEVECFLAGS="-I${CUQUANTUM_DIR}/include -L${CUQUANTUM_DIR}/lib -L${CUQUANTUM_DIR}/lib64 -lcustatevec -lcublas" +nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu diff --git a/apps/qsim_amplitudes.cc b/apps/qsim_amplitudes.cc index 29268633..d37fdd6b 100644 --- a/apps/qsim_amplitudes.cc +++ b/apps/qsim_amplitudes.cc @@ -30,11 +30,12 @@ #include "../lib/run_qsim.h" #include "../lib/simmux.h" #include "../lib/util.h" +#include "../lib/util_cpu.h" constexpr char usage[] = "usage:\n ./qsim_amplitudes -c circuit_file " "-d times_to_save_results -i input_files " "-o output_files -s seed -t num_threads " - "-f max_fused_size -v verbosity\n"; + "-f max_fused_size -v verbosity -z\n"; struct Options { std::string circuit_file; @@ -45,6 +46,7 @@ struct Options { unsigned num_threads = 1; unsigned max_fused_size = 2; unsigned verbosity = 0; + bool denormals_are_zeros = false; }; Options GetOptions(int argc, char* argv[]) { @@ -56,7 +58,7 @@ Options GetOptions(int argc, char* argv[]) { return std::atoi(word.c_str()); }; - while ((k = getopt(argc, argv, "c:d:i:s:o:t:f:v:")) != -1) { + while ((k = getopt(argc, argv, "c:d:i:s:o:t:f:v:z")) != -1) { switch (k) { case 'c': opt.circuit_file = optarg; @@ -82,6 +84,9 @@ Options GetOptions(int argc, char* argv[]) { case 'v': opt.verbosity = std::atoi(optarg); break; + case 'z': + opt.denormals_are_zeros = true; + break; default: qsim::IO::errorf(usage); exit(1); @@ -162,6 +167,10 @@ int main(int argc, char* argv[]) { return 1; } + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + struct Factory { Factory(unsigned num_threads) : num_threads(num_threads) {} diff --git a/apps/qsim_base.cc b/apps/qsim_base.cc index 062dfdde..5a7604d5 100644 --- a/apps/qsim_base.cc +++ b/apps/qsim_base.cc @@ -26,6 +26,11 @@ #include "../lib/io_file.h" #include "../lib/run_qsim.h" #include "../lib/simmux.h" +#include "../lib/util_cpu.h" + +constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d 
maxtime " + "-s seed -t threads -f max_fused_size " + "-v verbosity -z\n"; struct Options { std::string circuit_file; @@ -34,18 +39,15 @@ struct Options { unsigned num_threads = 1; unsigned max_fused_size = 2; unsigned verbosity = 0; + bool denormals_are_zeros = false; }; Options GetOptions(int argc, char* argv[]) { - constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d maxtime " - "-s seed -t threads -f max_fused_size " - "-v verbosity\n"; - Options opt; int k; - while ((k = getopt(argc, argv, "c:d:s:t:f:v:")) != -1) { + while ((k = getopt(argc, argv, "c:d:s:t:f:v:z")) != -1) { switch (k) { case 'c': opt.circuit_file = optarg; @@ -65,6 +67,9 @@ Options GetOptions(int argc, char* argv[]) { case 'v': opt.verbosity = std::atoi(optarg); break; + case 'z': + opt.denormals_are_zeros = true; + break; default: qsim::IO::errorf(usage); exit(1); @@ -77,6 +82,7 @@ Options GetOptions(int argc, char* argv[]) { bool ValidateOptions(const Options& opt) { if (opt.circuit_file.empty()) { qsim::IO::errorf("circuit file is not provided.\n"); + qsim::IO::errorf(usage); return false; } @@ -114,6 +120,10 @@ int main(int argc, char* argv[]) { return 1; } + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + struct Factory { Factory(unsigned num_threads) : num_threads(num_threads) {} diff --git a/apps/qsim_base_custatevec.cu b/apps/qsim_base_custatevec.cu new file mode 100644 index 00000000..a83f3e46 --- /dev/null +++ b/apps/qsim_base_custatevec.cu @@ -0,0 +1,171 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include + +#include "../lib/circuit_qsim_parser.h" +#include "../lib/formux.h" +#include "../lib/fuser_mqubit.h" +#include "../lib/gates_qsim.h" +#include "../lib/io_file.h" +#include "../lib/run_qsim.h" +#include "../lib/simulator_custatevec.h" +#include "../lib/util_custatevec.h" + +struct Options { + std::string circuit_file; + unsigned maxtime = std::numeric_limits::max(); + unsigned seed = 1; + unsigned max_fused_size = 2; + unsigned verbosity = 0; +}; + +Options GetOptions(int argc, char* argv[]) { + constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d maxtime " + "-s seed -f max_fused_size -v verbosity\n"; + + Options opt; + + int k; + + while ((k = getopt(argc, argv, "c:d:s:f:v:")) != -1) { + switch (k) { + case 'c': + opt.circuit_file = optarg; + break; + case 'd': + opt.maxtime = std::atoi(optarg); + break; + case 's': + opt.seed = std::atoi(optarg); + break; + case 'f': + opt.max_fused_size = std::atoi(optarg); + break; + case 'v': + opt.verbosity = std::atoi(optarg); + break; + default: + qsim::IO::errorf(usage); + exit(1); + } + } + + return opt; +} + +bool ValidateOptions(const Options& opt) { + if (opt.circuit_file.empty()) { + qsim::IO::errorf("circuit file is not provided.\n"); + return false; + } + + return true; +} + +template +void PrintAmplitudes( + unsigned num_qubits, const StateSpace& state_space, const State& state) { + static constexpr char const* bits[8] = { + "000", "001", "010", "011", "100", "101", "110", "111", + }; + + uint64_t size = 
std::min(uint64_t{8}, uint64_t{1} << num_qubits); + unsigned s = 3 - std::min(unsigned{3}, num_qubits); + + for (uint64_t i = 0; i < size; ++i) { + auto a = state_space.GetAmpl(state, i); + qsim::IO::messagef("%s:%16.8g%16.8g%16.8g\n", + bits[i] + s, std::real(a), std::imag(a), std::norm(a)); + } +} + +int main(int argc, char* argv[]) { + using namespace qsim; + + auto opt = GetOptions(argc, argv); + if (!ValidateOptions(opt)) { + return 1; + } + + using fp_type = float; + + Circuit> circuit; + if (!CircuitQsimParser::FromFile(opt.maxtime, opt.circuit_file, + circuit)) { + return 1; + } + + struct Factory { + using Simulator = qsim::SimulatorCuStateVec; + using StateSpace = Simulator::StateSpace; + + Factory() { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; + }; + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Fuser = MultiQubitGateFuser>; + using Runner = QSimRunner; + + Factory factory; + + StateSpace state_space = factory.CreateStateSpace(); + State state = state_space.Create(circuit.num_qubits); + + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return 1; + } + + state_space.SetStateZero(state); + + Runner::Parameter param; + param.max_fused_size = opt.max_fused_size; + param.seed = opt.seed; + param.verbosity = opt.verbosity; + + if (Runner::Run(param, factory, circuit, state)) { + PrintAmplitudes(circuit.num_qubits, state_space, state); + } + + return 0; +} diff --git a/apps/qsim_qtrajectory_cuda.cu 
b/apps/qsim_qtrajectory_cuda.cu new file mode 100644 index 00000000..65fe1cd3 --- /dev/null +++ b/apps/qsim_qtrajectory_cuda.cu @@ -0,0 +1,334 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../lib/channels_qsim.h" +#include "../lib/circuit_qsim_parser.h" +#include "../lib/expect.h" +#include "../lib/fuser_mqubit.h" +#include "../lib/gates_qsim.h" +#include "../lib/io_file.h" +#include "../lib/qtrajectory.h" +#include "../lib/simulator_cuda.h" + +struct Options { + std::string circuit_file; + std::vector times = {std::numeric_limits::max()}; + double amplitude_damp_const = 0; + double phase_damp_const = 0; + unsigned traj0 = 0; + unsigned num_trajectories = 10; + unsigned max_fused_size = 2; + unsigned verbosity = 0; +}; + +constexpr char usage[] = "usage:\n ./qsim_qtrajectory_cuda.x " + "-c circuit_file -d times_to_calculate_observables " + "-a amplitude_damping_const -p phase_damping_const " + "-t traj0 -n num_trajectories -f max_fused_size " + "-v verbosity\n"; + +Options GetOptions(int argc, char* argv[]) { + Options opt; + + int k; + + auto to_int = [](const std::string& word) -> unsigned { + return std::atoi(word.c_str()); + }; + + while ((k = getopt(argc, argv, "c:d:a:p:t:n:f:v:")) != -1) { + switch (k) { + case 'c': + opt.circuit_file = optarg; + break; + case 'd': + 
qsim::SplitString(optarg, ',', to_int, opt.times); + break; + case 'a': + opt.amplitude_damp_const = std::atof(optarg); + break; + case 'p': + opt.phase_damp_const = std::atof(optarg); + break; + case 't': + opt.traj0 = std::atoi(optarg); + break; + case 'n': + opt.num_trajectories = std::atoi(optarg); + break; + case 'f': + opt.max_fused_size = std::atoi(optarg); + break; + case 'v': + opt.verbosity = std::atoi(optarg); + break; + + default: + qsim::IO::errorf(usage); + exit(1); + } + } + + return opt; +} + +bool ValidateOptions(const Options& opt) { + if (opt.circuit_file.empty()) { + qsim::IO::errorf("circuit file is not provided.\n"); + qsim::IO::errorf(usage); + return false; + } + + if (opt.times.size() == 0) { + qsim::IO::errorf("times to calculate observables are not provided.\n"); + return false; + } + + for (std::size_t i = 1; i < opt.times.size(); i++) { + if (opt.times[i - 1] == opt.times[i]) { + qsim::IO::errorf("duplicate times to calculate observables.\n"); + return false; + } else if (opt.times[i - 1] > opt.times[i]) { + qsim::IO::errorf("times to calculate observables are not sorted.\n"); + return false; + } + } + + return true; +} + +template +std::vector> AddNoise( + const qsim::Circuit& circuit, const std::vector& times, + const Channel1& channel1, const Channel2& channel2) { + std::vector> ncircuits; + ncircuits.reserve(times.size()); + + qsim::NoisyCircuit ncircuit; + + ncircuit.num_qubits = circuit.num_qubits; + ncircuit.channels.reserve(5 * circuit.gates.size()); + + unsigned cur_time_index = 0; + + for (std::size_t i = 0; i < circuit.gates.size(); ++i) { + const auto& gate = circuit.gates[i]; + + ncircuit.channels.push_back(qsim::MakeChannelFromGate(3 * gate.time, gate)); + + for (auto q : gate.qubits) { + ncircuit.channels.push_back(channel1.Create(3 * gate.time + 1, q)); + } + + for (auto q : gate.qubits) { + ncircuit.channels.push_back(channel2.Create(3 * gate.time + 2, q)); + } + + unsigned t = times[cur_time_index]; + + if (i ==
circuit.gates.size() - 1 || t < circuit.gates[i + 1].time) { + ncircuits.push_back(std::move(ncircuit)); + + ncircuit = {}; + + if (i < circuit.gates.size() - 1) { + if (circuit.gates[i + 1].time > times.back()) { + break; + } + + ncircuit.num_qubits = circuit.num_qubits; + ncircuit.channels.reserve(5 * circuit.gates.size()); + } + + ++cur_time_index; + } + } + + return ncircuits; +} + +template +std::vector>> GetObservables( + unsigned num_qubits) { + std::vector>> observables; + observables.reserve(num_qubits); + + using X = qsim::GateX; + + for (unsigned q = 0; q < num_qubits; ++q) { + observables.push_back({{{1.0, 0.0}, {X::Create(0, q)}}}); + } + + return observables; +} + +int main(int argc, char* argv[]) { + using namespace qsim; + + using fp_type = float; + + struct Factory { + using Simulator = qsim::SimulatorCUDA; + using StateSpace = Simulator::StateSpace; + + Factory(const StateSpace::Parameter& param1, + const Simulator::Parameter& param2) + : param1(param1), param2(param2) {} + + StateSpace CreateStateSpace() const { + return StateSpace(param1); + } + + Simulator CreateSimulator() const { + return Simulator(param2); + } + + const StateSpace::Parameter& param1; + const Simulator::Parameter& param2; + }; + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Fuser = MultiQubitGateFuser>; + using QTSimulator = QuantumTrajectorySimulator, + MultiQubitGateFuser, + Simulator>; + + auto opt = GetOptions(argc, argv); + if (!ValidateOptions(opt)) { + return 1; + } + + Circuit> circuit; + unsigned maxtime = opt.times.back(); + if (!CircuitQsimParser::FromFile(maxtime, opt.circuit_file, + circuit)) { + return 1; + } + + if (opt.times.size() == 1 + && opt.times[0] == std::numeric_limits::max()) { + opt.times[0] = circuit.gates.back().time; + } + + StateSpace::Parameter param1; + Simulator::Parameter param2; + Factory factory(param1, param2); + + Simulator simulator = 
factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); + + State state = state_space.Create(circuit.num_qubits); + + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return 1; + } + + typename QTSimulator::Parameter param3; + param3.max_fused_size = opt.max_fused_size; + param3.verbosity = opt.verbosity; + param3.apply_last_deferred_ops = true; + + auto channel1 = AmplitudeDampingChannel(opt.amplitude_damp_const); + auto channel2 = PhaseDampingChannel(opt.phase_damp_const); + + auto noisy_circuits = AddNoise(circuit, opt.times, channel1, channel2); + + auto observables = GetObservables>(circuit.num_qubits); + + std::vector>>> results; + results.reserve(opt.num_trajectories); + + QTSimulator::Stat stat; + + using CleanResults = std::vector>>; + CleanResults primary_results(noisy_circuits.size()); + + for (unsigned i = 0; i < opt.num_trajectories; ++i) { + results.push_back({}); + results[i].reserve(noisy_circuits.size()); + + state_space.SetStateZero(state); + + auto seed = noisy_circuits.size() * (i + opt.traj0); + + for (unsigned s = 0; s < noisy_circuits.size(); ++s) { + if (!QTSimulator::RunOnce(param3, noisy_circuits[s], seed++, + state_space, simulator, state, stat)) { + return 1; + } + + results[i].push_back({}); + results[i][s].reserve(observables.size()); + + primary_results[s].reserve(observables.size()); + + if (stat.primary && !primary_results[s].empty()) { + for (std::size_t k = 0; k < observables.size(); ++k) { + results[i][s].push_back(primary_results[s][k]); + } + } else { + for (const auto& obs : observables) { + auto result = ExpectationValue(obs, simulator, state); + results[i][s].push_back(result); + + if (stat.primary) { + primary_results[s].push_back(result); + param3.apply_last_deferred_ops = false; + } + } + } + } + } + + for (unsigned i = 1; i < opt.num_trajectories; ++i) { + for (unsigned s = 0; s < noisy_circuits.size(); ++s) { + for (unsigned k = 0; k < 
observables.size(); ++k) { + results[0][s][k] += results[i][s][k]; + } + } + } + + double f = 1.0 / opt.num_trajectories; + + for (unsigned s = 0; s < noisy_circuits.size(); ++s) { + for (unsigned k = 0; k < observables.size(); ++k) { + results[0][s][k] *= f; + } + } + + for (unsigned s = 0; s < noisy_circuits.size(); ++s) { + IO::messagef("#time=%u\n", opt.times[s]); + + for (unsigned k = 0; k < observables.size(); ++k) { + IO::messagef("%4u %4u %17.9g %17.9g\n", s, k, + std::real(results[0][s][k]), std::imag(results[0][s][k])); + } + } + + return 0; +} diff --git a/apps/qsim_von_neumann.cc b/apps/qsim_von_neumann.cc index df9b32fb..46576083 100644 --- a/apps/qsim_von_neumann.cc +++ b/apps/qsim_von_neumann.cc @@ -28,6 +28,11 @@ #include "../lib/io_file.h" #include "../lib/run_qsim.h" #include "../lib/simmux.h" +#include "../lib/util_cpu.h" + +constexpr char usage[] = "usage:\n ./qsim_von_neumann -c circuit -d maxtime " + "-s seed -t threads -f max_fused_size " + "-v verbosity -z\n"; struct Options { std::string circuit_file; @@ -36,18 +41,15 @@ struct Options { unsigned num_threads = 1; unsigned max_fused_size = 2; unsigned verbosity = 0; + bool denormals_are_zeros = false; }; Options GetOptions(int argc, char* argv[]) { - constexpr char usage[] = "usage:\n ./qsim_von_neumann -c circuit -d maxtime " - "-s seed -t threads -f max_fused_size " - "-v verbosity\n"; - Options opt; int k; - while ((k = getopt(argc, argv, "c:d:s:t:f:v:")) != -1) { + while ((k = getopt(argc, argv, "c:d:s:t:f:v:z")) != -1) { switch (k) { case 'c': opt.circuit_file = optarg; @@ -67,6 +69,9 @@ Options GetOptions(int argc, char* argv[]) { case 'v': opt.verbosity = std::atoi(optarg); break; + case 'z': + opt.denormals_are_zeros = true; + break; default: qsim::IO::errorf(usage); exit(1); @@ -79,6 +84,7 @@ Options GetOptions(int argc, char* argv[]) { bool ValidateOptions(const Options& opt) { if (opt.circuit_file.empty()) { qsim::IO::errorf("circuit file is not provided.\n"); + 
qsim::IO::errorf(usage); return false; } @@ -99,6 +105,10 @@ int main(int argc, char* argv[]) { return 1; } + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + struct Factory { Factory(unsigned num_threads) : num_threads(num_threads) {} diff --git a/apps/qsimh_amplitudes.cc b/apps/qsimh_amplitudes.cc index cf57f121..7cb1b085 100644 --- a/apps/qsimh_amplitudes.cc +++ b/apps/qsimh_amplitudes.cc @@ -30,12 +30,13 @@ #include "../lib/run_qsimh.h" #include "../lib/simmux.h" #include "../lib/util.h" +#include "../lib/util_cpu.h" constexpr char usage[] = "usage:\n ./qsimh_amplitudes -c circuit_file " "-d maxtime -k part1_qubits " "-w prefix -p num_prefix_gates -r num_root_gates " "-i input_file -o output_file -t num_threads " - "-v verbosity\n"; + "-v verbosity -z\n"; struct Options { std::string circuit_file; @@ -48,6 +49,7 @@ struct Options { unsigned num_root_gatexs = 0; unsigned num_threads = 1; unsigned verbosity = 0; + bool denormals_are_zeros = false; }; Options GetOptions(int argc, char* argv[]) { @@ -59,7 +61,7 @@ Options GetOptions(int argc, char* argv[]) { return std::atoi(word.c_str()); }; - while ((k = getopt(argc, argv, "c:d:k:w:p:r:i:o:t:v:")) != -1) { + while ((k = getopt(argc, argv, "c:d:k:w:p:r:i:o:t:v:z")) != -1) { switch (k) { case 'c': opt.circuit_file = optarg; @@ -91,6 +93,9 @@ Options GetOptions(int argc, char* argv[]) { case 'v': opt.verbosity = std::atoi(optarg); break; + case 'z': + opt.denormals_are_zeros = true; + break; default: qsim::IO::errorf(usage); exit(1); @@ -180,6 +185,10 @@ int main(int argc, char* argv[]) { } auto parts = GetParts(circuit.num_qubits, opt.part1); + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + std::vector bitstrings; auto num_qubits = circuit.num_qubits; if (!BitstringsFromFile(num_qubits, opt.input_file, bitstrings)) { diff --git a/apps/qsimh_base.cc b/apps/qsimh_base.cc index 7b9190d8..eb0c9c6c 100644 --- a/apps/qsimh_base.cc +++ b/apps/qsimh_base.cc @@ 
-30,11 +30,12 @@ #include "../lib/run_qsimh.h" #include "../lib/simmux.h" #include "../lib/util.h" +#include "../lib/util_cpu.h" constexpr char usage[] = "usage:\n ./qsimh_base -c circuit_file " "-d maximum_time -k part1_qubits " "-w prefix -p num_prefix_gates -r num_root_gates " - "-t num_threads -v verbosity\n"; + "-t num_threads -v verbosity -z\n"; struct Options { std::string circuit_file; @@ -45,6 +46,7 @@ struct Options { unsigned num_root_gatexs = 0; unsigned num_threads = 1; unsigned verbosity = 0; + bool denormals_are_zeros = false; }; Options GetOptions(int argc, char* argv[]) { @@ -56,7 +58,7 @@ Options GetOptions(int argc, char* argv[]) { return std::atoi(word.c_str()); }; - while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:v:")) != -1) { + while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:v:z")) != -1) { switch (k) { case 'c': opt.circuit_file = optarg; @@ -82,6 +84,9 @@ Options GetOptions(int argc, char* argv[]) { case 'v': opt.verbosity = std::atoi(optarg); break; + case 'z': + opt.denormals_are_zeros = true; + break; default: qsim::IO::errorf(usage); exit(1); @@ -142,6 +147,10 @@ int main(int argc, char* argv[]) { } auto parts = GetParts(circuit.num_qubits, opt.part1); + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + uint64_t num_bitstrings = std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits); diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 00000000..2d2638bc --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,3 @@ +black~=22.3.0 +flynt~=0.60 +pytest diff --git a/docs/_book.yaml b/docs/_book.yaml index 3f344eb1..2fe9a2c8 100644 --- a/docs/_book.yaml +++ b/docs/_book.yaml @@ -6,51 +6,62 @@ upper_tabs: menu: - include: /_book/menu_software.yaml lower_tabs: - # Subsite tabs - other: - - name: "Guide & Tutorials" - contents: - - title: "qsim and qsimh" - path: /qsim/overview - - title: "Usage" - path: /qsim/usage - - title: "Installing qsimcirq" - path: /qsim/install_qsimcirq - - title: 
"Cirq interface" - path: /qsim/cirq_interface - - title: "Input circuit file format" - path: /qsim/input_format - - title: "Template naming" - path: /qsim/type_reference - - title: "Build with Bazel" - path: /qsim/bazel - - title: "Testing qsim" - path: /qsim/testing - - title: "Docker" - path: /qsim/docker - - title: "Release process" - path: /qsim/release + # Subsite tabs + other: + - name: "Tutorials" + contents: + - title: "Get started with qsimcirq" + path: /qsim/tutorials/qsimcirq + - heading: "qsim on Google Cloud" + - title: "Before you begin" + path: /qsim/tutorials/gcp_before_you_begin + - title: "CPU-based simulation" + path: /qsim/tutorials/gcp_cpu + - title: "GPU-based simulation" + path: /qsim/tutorials/gcp_gpu + - title: "Multinode simulation" + path: /qsim/tutorials/multinode + - heading: "Other tutorials" + - title: "Simulate a large circuit" + path: /qsim/tutorials/q32d14 + - title: "Simulate noise" + path: /qsim/tutorials/noisy_qsimcirq - - heading: "Tutorials" - - title: "Get started with qsimcirq" - path: /qsim/tutorials/qsimcirq - - title: "Quantum simulation on GCP with Cirq and qsim" - path: /qsim/tutorials/qsimcirq_gcp - - title: "Simulate a large quantum circuit" - path: /qsim/tutorials/q32d14 + - name: "Guides" + contents: + - title: "Introduction" + path: /qsim/overview + - title: "Choosing hardware" + path: /qsim/choose_hw + - title: "Command line reference" + path: /qsim/usage + - title: "Python interface" + path: /qsim/cirq_interface + - title: "C++ templates" + path: /qsim/type_reference + - title: "C++ builds with Bazel" + path: /qsim/bazel + - title: "Docker builds" + path: /qsim/docker + - title: "Testing qsim" + path: /qsim/testing + - title: "Release process" + path: /qsim/release - - name: "Python Reference" - skip_translation: true - contents: - - title: "All Symbols" - path: /reference/python/qsimcirq/all_symbols - - include: /reference/python/qsimcirq/_toc.yaml + - name: "Python Reference" + skip_translation: true + 
contents: + - title: "All Symbols" + path: /reference/python/qsimcirq/all_symbols + - include: /reference/python/qsimcirq/_toc.yaml - - name: "C++ Reference" - skip_translation: true - contents: - - title: "All Symbols" - path: /reference/cc/qsim - - include: /reference/cc/qsim/_doxygen.yaml + - name: "C++ Reference" + skip_translation: true + contents: + - title: "All Symbols" + path: /reference/cc/qsim + - include: /reference/cc/qsim/_doxygen.yaml - include: /_book/upper_tabs_right.yaml + + diff --git a/docs/_index.yaml b/docs/_index.yaml index 0b93d76c..e525f69c 100644 --- a/docs/_index.yaml +++ b/docs/_index.yaml @@ -15,7 +15,7 @@ book_path: /qsim/_book.yaml project_path: /qsim/_project.yaml description: > - Quantum circuit simulators qsim and qsimh. + Quantum circuit simulator qsim. landing_page: custom_css_path: /site-assets/css/style.css rows: @@ -24,7 +24,7 @@ landing_page: - hero - description-50 - padding-large - heading: qsim and qsimh + heading: qsim icon: path: /site-assets/images/icons/icon_qsim.png description: > @@ -39,12 +39,6 @@ landing_page: qsim is integrated with Cirq and can be used to run simulations of up to 40 qubits on a 90 core Intel Xeon workstation.

-

qsimh

-

- qsimh is a hybrid Schrödinger-Feynman simulator built for parallel - execution on a cluster of machines. It produces amplitudes for user- - specified output bitstrings. -

buttons: - label: Get started with qsim on Cirq path: /qsim/tutorials/qsimcirq @@ -64,9 +58,9 @@ landing_page: # Define a circuit to run # (Example is from the 2019 "Quantum Supremacy" experiement) - circuit = cirq.experiments. + circuit = (cirq.experiments. random_rotations_between_grid_interaction_layers_circuit( - qubits=qubits, depth=16) + qubits=qubits, depth=16)) # Measure qubits at the end of the circuit circuit.append(cirq.measure(*qubits, key='all_qubits')) @@ -84,28 +78,11 @@ landing_page: options: - cards items: - - heading: "Schrödinger simulation via qsim" - image_path: /site-assets/images/cards/qsim-card-schrodinger.png - description: > - qsim is a full wavefunction simulator that has been optimized to support - vectorized operations and multi-threading. - buttons: - - label: "Learn more" - path: /qsim/usage - - heading: "Schrödinger-Feynman simulation via qsimh" - image_path: /site-assets/images/cards/qsim-card-schrodinger-feynman.png - description: > - qsimh is a hybrid Schrödinger-Feynman simulator. It simulates separate - disjoint sets of qubit using a full wave vector simulator, and then uses - Feynman paths to sum over gates that span the sets. - buttons: - - label: "Learn more" - path: /qsim/usage#qsimh_base_usage - heading: "Cirq integration" image_path: /site-assets/images/cards/qsim-card-cirq-integrations.png description: > Cirq is a python framework for writing, simulating, and executing - quantum programs. Cirq’s built in simulator is useful to around 20 + quantum programs. Cirq's built in simulator is useful to around 20 qubits. By using the qsim Cirq simulator one can boost the number of qubits simulated to be mostly limited by available ram. Up to 40 qubits can be simulated on a 90 core Intel Xeon workstation. @@ -115,9 +92,27 @@ landing_page: - heading: "Install qsim on GCP" image_path: /site-assets/images/cards/qsim-card-gcp.jpg description: > - Learn how to simulate up to 38 qubits on Google Cloud’s Compute Engine. 
+ Learn how to simulate up to 38 qubits on Google Cloud's Compute Engine. qsim has a prepackaged docker image that allows easy deployment of qsim, Juypter, and Cirq onto a virtual machine. buttons: - label: "Learn more" - path: /qsim/tutorials/qsimcirq_gcp + path: /qsim/tutorials/gcp_before_you_begin + - heading: "Upgrades to qsim" + image_path: /site-assets/images/cards/qsim-card-schrodinger.png + description: > + To help researchers and developers develop quantum algorithms today, we + have made updates to qsim that make it more performant and intuitive, + and more "hardware-like". + buttons: + - label: "Learn more" + path: https://opensource.googleblog.com/2021/11/Upgrading%20qsim%20Google%20Quantum%20AIs%20Open%20Source%20Quantum%20Simulator%20.html?linkId=138925083 + - heading: "Integrating qsim with NVIDIA's cuQuantum SDK" + image_path: /qsim/images/qsim_nvidia.png + description: > + The integration between qsim and the NVIDIA cuQuantum SDK will enable + qsim users to make the most of GPUs when developing quantum algorithms + and applications. + buttons: + - label: "Learn more" + path: https://opensource.googleblog.com/2021/11/qsim%20integrates%20with%20NVIDIA%20cuQuantum%20SDK%20to%20accelerate%20quantum%20circuit%20simulations%20on%20NVIDIA%20GPUs.html diff --git a/docs/choose_hw.md b/docs/choose_hw.md new file mode 100644 index 00000000..3be2bc06 --- /dev/null +++ b/docs/choose_hw.md @@ -0,0 +1,934 @@ + + +# Choosing hardware for your qsim simulation + +As you increase the size and complexity of your quantum simulation, you rapidly +require a large increase in computational power. This guide describes +considerations that can help you choose hardware for your simulation. + +Your simulation setup depends on the following: + +* Noise; noisy (realistic) simulations require more compute power than + noiseless (idealised) simulations. +* Number of qubits. +* Circuit depth; the number of time steps required to perform the circuit. 
+ +## Quick start + +The following graph provides loose guidelines to help you get started with +choosing hardware for your simulation. The qubit upper bounds in this chart are +not technical limits. + +![Decision tree for hardware to run a qsim simulation.](images/choose_hw.png) + +## Choose hardware for your simulation + +### 1. Evaluate whether your simulation can be run locally + +If you have a modern laptop with at least 8GB of memory, you can run your +simulation locally in the following cases: + +* Noiseless simulations that use fewer than 29 qubits. +* Noisy simulations that use fewer than 18 qubits. + +If you intend to simulate a circuit many times, consider multinode simulation. +For more information about multinode simulation [see step 5, +below](#5_consider_multiple_compute_nodes). + +### 2. Estimate your memory requirements + +You can estimate your memory requirements with the following rule of thumb: +$ memory\ required = 8 \cdot 2^N bytes $ for an N-qubit circuit + +In addition to memory size, consider the bandwidth of your memory. qsim performs +best when it can use the maximum number of threads. Multi-threaded simulation +benefits from high-bandwidth memory (above 100GB/s). + +### 3. Decide between CPUs and GPUs + +* GPU hardware starts to outperform CPU hardware significantly (up to 15x + faster) for circuits with more than 20 qubits. +* The maximum number of qubits that you can simulate with a GPU is limited by + the memory of the GPU. Currently, for a noiseless simulation on an NVIDIA + A100 GPU (with 40GB of memory), the maximum number of qubits is 32. +* For noiseless simulations with 32-40 qubits, you can use CPUs. However, the + runtime time increases exponentially with the number of qubits, and runtimes + are long for simulations above 32 qubits. 
+ +The following charts show the runtime for a random circuit run on +[Google Compute Engine](https://cloud.google.com/compute), using an NVidia A100 +GPU, and a compute-optimized CPU (c2-standard-4). The first chart shows the +runtimes for the noiseless simulation. The second chart shows the runtimes for a +noisy simulation, using a phase damping channel (p=0.01). The charts use a log +scale. + +![qsim runtime comparison on multiple processors: noiseless](images/qsim_runtime_comparison_noiseless.png) +![qsim runtime comparison on multiple processors: noisy](images/qsim_runtime_comparison_noisy.png) + +### 4. Select a specific machine + +After you decide whether you want to use CPUs or GPUs for your simulation, +choose a specific machine: + +1. Restrict your options to machines that meet your memory requirements. For + more information about memory requirements, see step 2. +2. Decide if performance (speed) or cost is more important to you: + * For a table of performance benchmarks, see + [Sample benchmarks](#sample_benchmarks) below. + * For more information about GCP pricing, see the + [Google Cloud pricing calculator](https://cloud.google.com/products/calculator). + * Prioritizing performance is particularly important in the following + scenarios: + * Simulating with a **higher f value** (f is the maximum number of + qubits allowed per fused gate). + * For small to medium size circuits (up to 22 qubits), keep f low + (2 or 3). + * For medium to large size circuits (22+ qubits), use a higher f + (typically, f=4 is the best option). + * Simulating a **deep circuit** (depth 30+). + +### 5. Consider multiple compute nodes + +Simulating in multinode mode is useful when your simulation can be parallelized. +In a noisy simulation, the trajectories (also known as repetitions, iterations) +are “embarrassingly parallelizable”; there is an automated workflow for +distributing these trajectories over multiple nodes. 
A simulation of many +noiseless circuits can also be distributed over multiple compute nodes. + +For more information about running a multinode simulation, see [Multinode quantum +simulation using HTCondor on Google Cloud](/qsim/tutorials/multinode). + +## Runtime estimates + +Runtime grows exponentially with the number of qubits, and linearly with circuit +depth beyond 20 qubits. + +* For noiseless simulations, runtime grows at a rate of $ 2^N $ for an N-qubit + circuit. For more information about runtimes for small circuits, see + [Additional notes for advanced users](#additional_notes_for_advanced_users) + below. +* For noisy simulations, runtime grows at a rate of $ 2^N $ multiplied by the + number of iterations for an N-qubit circuit. + +## Additional notes for advanced users + +* The impact of noise on simulation depends on: + * What type of errors are included in your noise channel (decoherence, + depolarizing channels, coherent errors, readout errors). + * How you can represent your noise model using Kraus operator formalism: + * Performance is best in the case where all Kraus operators are + proportional to unitary matrices, such as when using only a + depolarizing channel. + * Using noise which cannot be represented with Kraus operators + proportional to unitary matrices, can slow down simulations by a + factor of up to 6 compared to using a depolarizing channel only. + * Noisy simulations are faster with lower noise (when one Kraus + operator dominates). +* Experimenting with the 'f' parameter (maximum number of qubits allowed per + fused gate): + * The advanced user is advised to try out multiple f values to optimize + their simulation setup. + * Note that f=2 or f=3 can be optimal for large circuits simulated on + CPUs with a smaller number of threads (say, up to four or eight + threads). However, this depends on the circuit structure. + * Note that f=6 is very rarely optimal. 
+* Using the optimal number of threads: + * Use the maximum number of threads on CPUs for the best performance. + * If the maximum number of threads is not used on multi-socket machines + then it is advisable to distribute threads evenly to all sockets or to + run all threads within a single socket. Separate simulations on each + socket can be run simultaneously in the latter case. + * Note that currently the number of CPU threads does not affect the + performance for small circuits (smaller than 17 qubits). Only one thread + is used because of OpenMP overhead. +* Runtime estimates for small circuits: + * For circuits that contain fewer than 20 qubits, the qsimcirq translation + layer performance overhead tends to dominate the runtime estimate. In + addition to this, qsim is not optimized for small circuits. + * The total small circuits runtime overhead for an N qubit circuit + depends on the circuit depth and on N. The overhead can be large enough to + conceal the $ 2^N $ growth in runtime. + +## Sample benchmarks + +**Noiseless simulation benchmarks data sheet** + +For a random circuit, depth=20, f=3, max threads. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
processor type + machine + # of qubits + runtime +
CPU + c2-standard-60 + 34 + 291.987 +
CPU + c2-standard-60 + 32 + 54.558 +
CPU + c2-standard-60 + 30 + 13.455 +
CPU + c2-standard-60 + 28 + 2.837 +
CPU + c2-standard-60 + 24 + 0.123 +
CPU + c2-standard-60 + 20 + 0.013 +
CPU + c2-standard-60 + 16 + 0.009 +
CPU + c2-standard-4-4 + 30 + 52.880 +
CPU + c2-standard-4-4 + 28 + 12.814 +
CPU + c2-standard-4-4 + 24 + 0.658 +
CPU + c2-standard-4-4 + 20 + 0.031 +
CPU + c2-standard-4-4 + 16 + 0.008 +
GPU + a100 + 32 + 7.415 +
GPU + a100 + 30 + 1.561 +
GPU + a100 + 28 + 0.384 +
GPU + a100 + 24 + 0.030 +
GPU + a100 + 20 + 0.010 +
GPU + a100 + 16 + 0.007 +
GPU + t4 + 30 + 10.163 +
GPU + t4 + 28 + 2.394 +
GPU + t4 + 24 + 0.118 +
GPU + t4 + 20 + 0.014 +
GPU + t4 + 16 + 0.007 +
+ +**Noisy simulation benchmarks data sheet** + +For one trajectory of a random circuit, depth=20, f=3, max threads. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
processor type + machine + noise type + # of qubits + runtime +
CPU + c2-standard-60 + depolarizing + 30 + 13.021 +
CPU + c2-standard-60 + depolarizing + 28 + 2.840 +
CPU + c2-standard-60 + depolarizing + 26 + 0.604 +
CPU + c2-standard-60 + depolarizing + 24 + 0.110 +
CPU + c2-standard-60 + depolarizing + 20 + 0.009 +
CPU + c2-standard-60 + depolarizing + 16 + 0.006 +
CPU + c2-standard-60 + dephasing + 30 + 122.788 +
CPU + c2-standard-60 + dephasing + 28 + 29.966 +
CPU + c2-standard-60 + dephasing + 26 + 6.378 +
CPU + c2-standard-60 + dephasing + 24 + 1.181 +
CPU + c2-standard-60 + dephasing + 20 + 0.045 +
CPU + c2-standard-60 + dephasing + 16 + 0.023 +
CPU + c2-standard-4-4 + depolarizing + 26 + 2.807 +
CPU + c2-standard-4-4 + depolarizing + 24 + 0.631 +
CPU + c2-standard-4-4 + depolarizing + 20 + 0.027 +
CPU + c2-standard-4-4 + depolarizing + 16 + 0.005 +
CPU + c2-standard-4-4 + dephasing + 26 + 33.038 +
CPU + c2-standard-4-4 + dephasing + 24 + 7.432 +
CPU + c2-standard-4-4 + dephasing + 20 + 0.230 +
CPU + c2-standard-4-4 + dephasing + 16 + 0.014 +
GPU + a100 + depolarizing + 30 + 1.568 +
GPU + a100 + depolarizing + 28 + 0.391 +
GPU + a100 + depolarizing + 26 + 0.094 +
GPU + a100 + depolarizing + 24 + 0.026 +
GPU + a100 + depolarizing + 20 + 0.006 +
GPU + a100 + depolarizing + 16 + 0.004 +
GPU + a100 + dephasing + 30 + 17.032 +
GPU + a100 + dephasing + 28 + 3.959 +
GPU + a100 + dephasing + 26 + 0.896 +
GPU + a100 + dephasing + 24 + 0.236 +
GPU + a100 + dephasing + 20 + 0.029 +
GPU + a100 + dephasing + 16 + 0.021 +
GPU + t4 + depolarizing + 30 + 10.229 +
GPU + t4 + depolarizing + 28 + 2.444 +
GPU + t4 + depolarizing + 26 + 0.519 +
GPU + t4 + depolarizing + 24 + 0.115 +
GPU + t4 + depolarizing + 20 + 0.009 +
GPU + t4 + depolarizing + 16 + 0.004 +
GPU + t4 + dephasing + 28 + 21.800 +
GPU + t4 + dephasing + 26 + 5.056 +
GPU + t4 + dephasing + 24 + 1.164 +
GPU + t4 + dephasing + 20 + 0.077 +
GPU + t4 + dephasing + 16 + 0.017 +
diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md index 20291fe8..24149708 100644 --- a/docs/cirq_interface.md +++ b/docs/cirq_interface.md @@ -50,7 +50,7 @@ which invokes qsim through the qsim-Cirq interface. ## Interface design and operations The purpose of this interface is to provide a performant simulator for quantum -circuits defined in Cirq. +circuits defined in Cirq. ### Classes @@ -75,12 +75,17 @@ This circuit can then be simulated using either `QSimSimulator` or `QSimSimulator` uses a Schrödinger full state-vector simulator, suitable for acquiring the complete state of a reasonably-sized circuit (~25 qubits on an average PC, or up to 40 qubits on high-performance VMs). + Options for the simulator, including number of threads and verbosity, can be -set with the `qsim_options` field using the `qsim_base` flag format defined in -the [usage docs](./usage.md). +set with the `qsim_options` field, which accepts a `QSimOptions` object as +defined in +[qsim_simulator.py](https://github.com/quantumlib/qsim/blob/master/qsimcirq/qsim_simulator.py). +These options can also be passed as a {str: val} dict, using the format +described by that class. ``` -qsim_options = {'t': 8, 'v': 0} +# equivalent to {'t': 8, 'v': 0} +qsim_options = qsimcirq.QSimOptions(cpu_threads=8, verbosity=0) my_sim = qsimcirq.QSimSimulator(qsim_options) myres = my_sim.simulate(program=my_circuit) ``` @@ -112,6 +117,18 @@ the circuit once for each repetition unless all measurements are terminal. This ensures that nondeterminism from intermediate measurements is properly reflected in the results. +In rare cases when the state vector and gate matrices have many zero entries +(denormal numbers), a significant performance slowdown can occur. 
Set +the `denormals_are_zeros` option to `True` to prevent this issue potentially +at the cost of a tiny precision loss: + +``` +# equivalent to {'t': 8, 'v': 0, 'z': True} +qsim_options = qsimcirq.QSimOptions(cpu_threads=8, verbosity=0, denormals_are_zeros=True) +my_sim = qsimcirq.QSimSimulator(qsim_options) +myres = my_sim.simulate(program=my_circuit) +``` + #### QSimhSimulator `QSimhSimulator` uses a hybrid Schrödinger-Feynman simulator. This limits it to @@ -136,8 +153,8 @@ outlined in the [usage docs](./usage.md). ## Additional features -The qsim-Cirq interface provides basic support for gate decomposition and -circuit parameterization. +The qsim-Cirq interface supports arbitrary gates and circuit parameterization. +Additionally, GPU execution of circuits can be requested if GPUs are available. ### Gate decompositions @@ -148,7 +165,36 @@ matrices, if one is specified. ### Parametrized circuits -QSimCircuit objects can also contain +`QSimCircuit` objects can also contain [parameterized gates](https://cirq.readthedocs.io/en/stable/docs/tutorials/basics.html#Using-parameter-sweeps) which have values assigned by Cirq's `ParamResolver`. See the link above for details on how to use this feature. + +### GPU execution + +`QSimSimulator` provides optional support for GPU execution of circuits, which +may improve performance. In order to use this feature, qsim must be compiled on +a device with the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads) +and run on a device with available NVIDIA GPUs. + +Compilation for GPU follows the same steps outlined in the +[Compiling qsimcirq](./cirq_interface.md#compiling-qsimcirq) section. +To compile with the NVIDIA cuStateVec library (v1.0.0 or higher is required), +set the environment variable `CUQUANTUM_DIR` to the path to the cuStateVec +library. + +`QSimOptions` provides five parameters to configure GPU execution. 
`use_gpu` +is required to enable GPU execution: +* `use_gpu`: if True, use GPU instead of CPU for simulation. +* `gpu_mode`: use CUDA if set to 0 (default value) or use the NVIDIA cuStateVec +library if set to any other value. + +If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can +optionally be set to fine-tune performance for a specific device or circuit. +In most cases, the default values provide good performance. +* `gpu_sim_threads`: number of threads per CUDA block to use for the GPU +Simulator. This must be a power of 2 in the range [32, 256]. +* `gpu_state_threads`: number of threads per CUDA block to use for the GPU +StateSpace. This must be a power of 2 in the range [32, 1024]. +* `gpu_data_blocks`: number of data blocks to use on GPU. Below 16 data blocks, +performance is noticeably reduced. diff --git a/docs/images/choose_hw.png b/docs/images/choose_hw.png new file mode 100644 index 00000000..76a894b4 Binary files /dev/null and b/docs/images/choose_hw.png differ diff --git a/docs/images/colab_connect.png b/docs/images/colab_connect.png new file mode 100644 index 00000000..1a52c18f Binary files /dev/null and b/docs/images/colab_connect.png differ diff --git a/docs/images/colab_connected.png b/docs/images/colab_connected.png new file mode 100644 index 00000000..779a85ab Binary files /dev/null and b/docs/images/colab_connected.png differ diff --git a/docs/images/colab_remote.png b/docs/images/colab_remote.png new file mode 100644 index 00000000..c1614895 Binary files /dev/null and b/docs/images/colab_remote.png differ diff --git a/docs/images/qsim_runtime_comparison_noiseless.png b/docs/images/qsim_runtime_comparison_noiseless.png new file mode 100644 index 00000000..5849813c Binary files /dev/null and b/docs/images/qsim_runtime_comparison_noiseless.png differ diff --git a/docs/images/qsim_runtime_comparison_noisy.png b/docs/images/qsim_runtime_comparison_noisy.png new file mode 100644 index 00000000..ae736592 Binary files /dev/null 
and b/docs/images/qsim_runtime_comparison_noisy.png differ diff --git a/docs/images/qsimcirq_gcp/connection.png b/docs/images/qsimcirq_gcp/connection.png new file mode 100644 index 00000000..ca66bc8a Binary files /dev/null and b/docs/images/qsimcirq_gcp/connection.png differ diff --git a/docs/images/qsimcirq_gcp/container.png b/docs/images/qsimcirq_gcp/container.png new file mode 100644 index 00000000..61ef9c07 Binary files /dev/null and b/docs/images/qsimcirq_gcp/container.png differ diff --git a/docs/install_qsimcirq.md b/docs/install_qsimcirq.md index 9fe69d4e..0a0f3182 100644 --- a/docs/install_qsimcirq.md +++ b/docs/install_qsimcirq.md @@ -1,49 +1,58 @@ # Installing qsimcirq -The qsim-Cirq Python interface is available as a PyPI package for Linux users. -For all other users, Dockerfiles are provided to install qsim in a contained +The qsim-Cirq Python interface is available as a PyPI package for Linux, MacOS and Windows users. +For all others, Dockerfiles are provided to install qsim in a contained environment. **Note:** The core qsim library (under [lib/](https://github.com/quantumlib/qsim/blob/master/lib)) can be included directly in C++ code without installing this interface. -## Linux installation +## Before installation Prior to installation, consider opening a [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). -The qsim-Cirq interface uses [CMake](https://cmake.org/) to ensure stable -compilation of its C++ libraries across a variety of Linux distributions. -CMake can be installed from their website, or with the command -`apt-get install cmake`. - -Other prerequisites (including pybind11 and pytest) are included in the +Prerequisites are included in the [`requirements.txt`](https://github.com/quantumlib/qsim/blob/master/requirements.txt) file, and will be automatically installed along with qsimcirq. -To install the qsim-Cirq interface on Linux, simply run `pip3 install qsimcirq`. 
-For examples of how to use this package, see the tests in -[qsim/qsimcirq_tests/](https://github.com/quantumlib/qsim/blob/master/qsimcirq_tests/). +If you'd like to develop qsimcirq, a separate set of dependencies are included +in the +[`dev-requirements.txt`](https://github.com/quantumlib/qsim/blob/master/dev-requirements.txt) +file. You can install them with `pip3 install -r dev-requirements.txt` or +`pip3 install qsimcirq[dev]`. + +## Linux installation + +We provide `qsimcirq` Python wheels on 64-bit `x86` architectures with `Python 3.{6,7,8,9}`. + +Simply run `pip3 install qsimcirq`. -## MacOS and Windows installation +## MacOS installation -For users interested in running qsim on a MacOS or Windows device, we strongly -recommend using the [Docker config](./docker.md) provided with this -repository. +We provide `qsimcirq` Python wheels on `x86` architectures with `Python 3.{6,7,8,9}`. -### Experimental install process +Simply run `pip3 install qsimcirq`. -Alternatively, MacOS and Windows users can follow the Linux install process, -but it is currently untested on those platforms. Users are encouraged to report -any issues seen with this process. +## Windows installation + +We provide `qsimcirq` Python wheels on 64-bit `x86` and `amd64` architectures with `Python 3.{6,7,8,9}`. + +Simply run `pip3 install qsimcirq`. + +## There's no compatible wheel for my machine! + +If existing wheels do not meet your needs please open an issue with your machine configuration (i.e. CPU architecture, Python version) and consider using the [Docker config](./docker.md) provided with this repository. ## Testing -After installing qsimcirq on your machine, you can test the installation by +After installing `qsimcirq` on your machine, you can test the installation by copying [qsimcirq_tests/qsimcirq_test.py](qsimcirq_tests/qsimcirq_test.py) to your machine and running `python3 -m pytest qsimcirq_test.py`. +It also has examples of how to use this package. 
+ +**Note:** Because of how Python searches for modules, the test file cannot +be run from inside a clone of the qsim repository, or from any parent +directory of such a repository. Failure to meet this criterion may result diff --git a/docs/tutorials/gcp_before_you_begin.md b/docs/tutorials/gcp_before_you_begin.md new file mode 100644 index 00000000..13296dfe --- /dev/null +++ b/docs/tutorials/gcp_before_you_begin.md @@ -0,0 +1,34 @@ +# Before you begin + +The following tutorials demonstrate how to configure the Google Cloud Platform +to run quantum simulations with qsim. + +You can use Google Cloud to run high-performance CPU-based simulations or +GPU-based simulations, depending on your requirements. For more information +about making a choice between CPU- and GPU-based simulations, see +[Choosing hardware for your qsim simulation](/qsim/choose_hw). + +This tutorial depends on resources provided by the Google Cloud Platform. + +* **Ensure that you have a Google Cloud Platform project.** You can reuse an + existing project, or create a new one, from your + [project dashboard](https://console.cloud.google.com/projectselector2/home/dashboard). + * For more information about Google Cloud projects, see + [Creating and managing projects](https://cloud.google.com/resource-manager/docs/creating-managing-projects) + in the Google Cloud documentation. + +* **Ensure that billing is enabled for your project.** + * For more information about billing, see + [Enable, disable, or change billing for a project](https://cloud.google.com/billing/docs/how-to/modify-project#enable-billing) + in the Google Cloud documentation. +* **Estimate costs for your project.** Use the + [Google Cloud Pricing Calculator](https://cloud.google.com/products/calculator) + to estimate the scale of the costs you might incur, based on your projected + usage. The resources that you use to simulate a quantum circuit on the + Google Cloud platform are billable. 
+* **Enable the Compute Engine API for your project.** You can enable APIs from + the [API Library Console](https://console.cloud.google.com/apis/library). On + the console, in the search box, enter "compute engine api" to find the API + and click through to Enable it. + * For more information about enabling the Compute Engine API, see + [Getting Started](https://cloud.google.com/apis/docs/getting-started) in + the Google Cloud documentation. diff --git a/docs/tutorials/gcp_cpu.md b/docs/tutorials/gcp_cpu.md new file mode 100644 index 00000000..0795e622 --- /dev/null +++ b/docs/tutorials/gcp_cpu.md @@ -0,0 +1,243 @@ +# CPU-based quantum simulation on Google Cloud + +In this tutorial, you configure and test a virtual machine (VM) to run CPU-based +quantum simulations. The configuration in this tutorial uses the qsim Docker +container, running on a Google Compute Engine VM. + +## 1. Create a virtual machine + +Follow the instructions in the +[Quickstart using a Linux VM](https://cloud.google.com/compute/docs/quickstart-linux) +guide to create a VM. In addition to the guidance under the Create a Linux VM +instance heading, ensure that your VM has the following properties: + +* In the **Machine Configuration** section: + 1. Select the tab for the **Compute Optimized** machine family. + 2. In the machine **Series** option, choose **C2**. + 3. In the **Machine type** option, choose **c2-standard-16**. This option + gives you 16 virtual CPUs and 64GB of RAM. + Note: This choice is for demonstration purposes only. For a live + experiment, see [Choosing hardware for your qsim + simulation](/qsim/choose_hw). +* In the **Boot disk section**, click the **Change** button, and choose + **Container-Optimized** operating system. This overrides the selection in + step 3 in [Create a Linux VM + instance](https://cloud.google.com/compute/docs/quickstart-linux#create_a_linux_vm_instance). 
+* In the **Firewall** section, ensure that both the **Allow HTTP traffic** + checkbox and the **Allow HTTPS traffic** checkbox are selected. + +When Google Cloud finishes creating the VM, you can see your VM listed in the +[Compute Instances dashboard](https://pantheon.corp.google.com/compute/instances) +for your project. + +### Find out more +* [Choosing the right machine family and + type](https://cloud.google.com/blog/products/compute/choose-the-right-google-compute-engine-machine-type-for-you) +* [Container-Optimized OS + Overview](https://cloud.google.com/container-optimized-os/docs/concepts/features-and-benefits) + +## 2. Prepare your computer + +Use SSH to create an encrypted tunnel from your computer to your VM and redirect +a local port to your VM over the tunnel. + +1. Install the `gcloud` command line tool. Follow the instructions in the + [Installing Cloud SDK](https://cloud.google.com/sdk/docs/install) + documentation. +2. After installation, run the `gcloud init` command to initialize the Google + Cloud environment. You need to provide the `gcloud` tool with details + about your VM, such as the project name and the region where your VM is + located. + 1. You can verify your environment by using the `gcloud config list` + command. +3. Create an SSH tunnel and redirect a local port to use the tunnel by typing + the following command in a terminal window on your computer. Replace + `[YOUR_INSTANCE_NAME]` with the name of your VM. + + ``` + gcloud compute ssh [YOUR_INSTANCE_NAME] -- -L 8888:localhost:8888 + ``` + +When the command completes successfully, your prompt changes from your local +machine to your virtual machine. + +## 3. Start the qsim Docker container on your virtual machine + +1. 
On the VM that you just created, start the qsim container: + + ``` + docker run -v `pwd`:/homedir -p 8888:8888 gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest & + ``` + + If you see a `permission denied` error message, you might need to add `sudo` + before your Docker command. For more information about running Docker, see the + [`docker run` command reference](https://docs.docker.com/engine/reference/run/#general-form). + +2. Verify the output from Docker as it downloads and starts the container. + The last few lines should be similar to the following output: + + ``` + To access the notebook, open this file in a browser: + file:///root/.local/share/jupyter/runtime/nbserver-1-open.html + Or copy and paste one of these URLs: + http://e1f7a7cca9fa:8888/?token=aa16e1b6d3f51c58928037d34cc6854dac47347dd4c0eae5 + or http://127.0.0.1:8888/?token=aa16e1b6d3f51c58928037d34cc6854dac47347dd4c0eae5 + ``` + +3. Copy the URL in the last line of output from your console, and save it for + the next task. + +## 4. Connect to your virtual machine + +The easiest way to use your VM is through a notebook environment like +[Google Colaboratory](https://colab.sandbox.google.com/notebooks/intro.ipynb?utm_source=scs-index#recent=true) +(Colab). Google Colab is a free, hosted notebook environment that enables you to +write, execute, and share Python code from your browser. + +However, the qsim Docker image also includes a Jupyter kernel and other +command-line tools. These tools enable you to connect directly to your container +and run your code. + +* {Colab} + + You can write code in a Colab notebook, and use your VM to run your code. In + this scenario, we use the + [Get Started with qsimcirq Colab notebook](https://quantumai.google/qsim/tutorials/qsimcirq). + + 1. Open the + [Get Started with qsimcirq notebook](https://quantumai.google/qsim/tutorials/qsimcirq). + 2. Click the **Connect** drop-down menu. + the Connect button to open the menu. + 3. 
Choose the **Connect to a local runtime** option to open the Local + connection settings window. + Google Colab Connect to Local Runtime button + 4. In the **Backend URL** text field, paste the URL that you saved in + [task 3](#3_start_the_qsim_docker_container_on_your_virtual_machine). + 5. Change the part of your URL that reads `127.0.0.1` to `localhost`. + Google Colab Local Runtime connection window + 6. Click the **Connect** button in the Local connection settings window. + + When your connection is ready, Colab displays a green checkmark beside the + Connected (Local) drop-down menu. + + Google Colab Local Runtime connection window + + The code cells in your notebook now execute on your VM instead of your local + computer. + +* {Jupyter} + + You can run your simulation directly in your Docker container, in Jupyter. + Jupyter runs in the qsim Docker container. + + 1. Open a browser window. + 2. In the navigation bar, paste the URL that you copied in [task + 3](#3_start_the_qsim_docker_container_on_your_virtual_machine). + 3. In the browser you should now see the Jupyter UI, running on your VM. + + The code that you execute here executes on your VM. You can navigate to qsim > + docs > tutorials to try other tutorials. + +* {Command line} + + You can run a quantum simulation using qsim from the command line. + Your code runs in the qsim Docker container. + + **Before you begin** + + For this scenario, you can connect to your machine directly over SSH + rather than create a tunnel. In [task 2, step 3](#2_prepare_your_computer) + above, remove the second half of the command. Instead of this command: + + ``` + gcloud compute ssh [YOUR_INSTANCE_NAME] -- -L 8888:localhost:8888 + ``` + + Run: + + ``` + gcloud compute ssh [YOUR_INSTANCE_NAME] + ``` + + Either command works for the purpose of this tutorial. Continue to task 4 then + complete the steps below, regardless of which command you use. + + **1. 
Copy the container ID for your qsim Docker container** + + Run `docker ps` to display the container ID. The output should look like + the following text: + + ``` + CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES i + 8ab217d640a3 gcr.io/quantum-291919/jupyter_qsim:latest "jupyter-notebook --…" 2 hours ago Up 2 hours 0.0.0.0:8888->8888/tcp dazzling_lovelace. + ``` + + In this case, the container ID is `8ab217d640a3`. + + **2. Connect to your qsim Docker container** + + Run `docker exec` to login to your container. Replace `[CONTAINER_ID]` + with the ID that you copied in step 1. + + ``` + docker exec -it [CONTAINER_ID] /bin/bash + ``` + + Your command prompt now executes commands in the container. + + **3. Verify your installation** + + You can use the code below to verify that qsim uses your qsim installation. + You can paste the code directly into the REPL, or paste the code in a + file. + + ``` + # Import Cirq and qsim + import cirq + import qsimcirq + + # Instantiate qubits and create a circuit + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit(cirq.H(q0), cirq.CX(q0, q1)) + + # Instantiate a simulator that uses the GPU + qsim_simulator = qsimcirq.QSimSimulator() + + # Run the simulation + print("Running simulation for the following circuit:") + print(circuit) + + qsim_results = qsim_simulator.compute_amplitudes( + circuit, bitstrings=[0b00, 0b01]) + + print("qsim results:") + print(qsim_results) + ``` + + After a moment, you should see a result that looks similar to the following. + + ``` + [(0.7071067690849304+0j), 0j] + ``` + +## Next steps + +After you finish, don't forget to stop or delete your VM on the Compute +Instances dashboard to prevent further billing. + +You are now ready to run your own large simulations on Google Cloud. 
If you want +to try a large circuit on Google Cloud, you can connect the +[Simulate a large quantum circuit](https://colab.sandbox.google.com/github/quantumlib/qsim/blob/master/docs/tutorials/q32d14.ipynb) +Colab notebook to your VM +([documentation](https://quantumai.google/qsim/tutorials/q32d14)). + +For more information about managing your VM, see the following documentation +from Google Cloud: + +* [Stopping and starting a VM](https://cloud.google.com/compute/docs/instances/stop-start-instance) +* [Suspending and resuming an instance](https://cloud.google.com/compute/docs/instances/suspend-resume-instance) +* [Deleting a VM instance](https://cloud.google.com/compute/docs/instances/deleting-instance) + +As an alternative to Google Cloud, you can download the Docker container or the +qsim source code to run quantum simulations on your own high-performance +computing platform. diff --git a/docs/tutorials/gcp_gpu.md b/docs/tutorials/gcp_gpu.md new file mode 100644 index 00000000..cf57e48d --- /dev/null +++ b/docs/tutorials/gcp_gpu.md @@ -0,0 +1,209 @@ +# GPU-based quantum simulation on Google Cloud + +In this tutorial, you configure and test a virtual machine (VM) to run GPU-based +quantum simulations on Google Cloud. + +Note: The later steps in this tutorial require you to enter several commands at the +command line. Some commands might require you to add `sudo` before the command. +For example, if a step asks you to type `icecream -fancy`, you might need to +type `sudo icecream -fancy`. + +## 1. Create a virtual machine + +Follow the instructions in the +[Quickstart using a Linux VM](https://cloud.google.com/compute/docs/quickstart-linux) +guide to create a VM. In addition to the guidance specified in the Create a Linux VM +instance section, ensure that your VM has the following properties: + +* In the **Machine Configuration** section: + 1. Select the tab for the **GPU** machine family. + 2. In the **GPU type** option, choose **NVIDIA Tesla A100**. + 3. 
In the **Number of GPUs** option, choose **1**. +* In the **Boot disk** section, click the **Change** button: + 1. In the **Operating System** option, choose **Ubuntu**. + 2. In the **Version** option, choose **20.04 LTS**. + 3. In the **Size** field, enter **30** (minimum). +* The instructions above override steps 3 through 5 in the [Create a Linux VM + instance](https://cloud.google.com/compute/docs/quickstart-linux) + Quickstart. +* In the **Firewall** section, ensure that both the **Allow HTTP traffic** + checkbox and the **Allow HTTPS traffic** checkboxes are selected. + +When Google Cloud finishes creating the VM, you can see your VM listed in the +[Compute Instances dashboard](https://pantheon.corp.google.com/compute/instances) +for your project. + +### Find out more + +* [Choosing hardware for your qsim simulation](/qsim/choose_hw) +* [Choosing the right machine family and type](https://cloud.google.com/blog/products/compute/choose-the-right-google-compute-engine-machine-type-for-you) +* [Creating a VM with attached GPUs](https://cloud.google.com/compute/docs/gpus/create-vm-with-gpus#create-new-gpu-vm) + +## 2. Prepare your computer + +Use SSH in the `gcloud` tool to communicate with your VM. + +1. Install the `gcloud` command line tool. Follow the instructions in the + [Installing Cloud SDK](https://cloud.google.com/sdk/docs/install) + documentation. +2. After installation, run the `gcloud init` command to initialize the Google + Cloud environment. You need to provide the `gcloud` tool with details + about your VM, such as the project name and the region where your VM is + located. + 1. You can verify your environment by using the `gcloud config list` + command. +3. Connect to your VM by using SSH. Replace `[YOUR_INSTANCE_NAME]` with the + name of your VM. + + ```shell + gcloud compute ssh [YOUR_INSTANCE_NAME] + ``` + +When the command completes successfully, your prompt changes from your local +machine to your virtual machine. + +## 3. 
Enable your virtual machine to use the GPU + +1. Install the GPU driver. Complete the steps provided in the following + sections of the [Installing GPU + drivers](https://cloud.google.com/compute/docs/gpus/install-drivers-gpu) + guide: + * [Examples](https://cloud.google.com/compute/docs/gpus/install-drivers-gpu#examples), + under the **Ubuntu** tab. For step 3, only perform the steps for + **Ubuntu 20.04** (steps 3a through 3f). + * [Verifying the GPU driver install](https://cloud.google.com/compute/docs/gpus/install-drivers-gpu#verify-driver-install) +2. Install the CUDA toolkit. + + ```shell + sudo apt install -y nvidia-cuda-toolkit + ``` + +3. Add your CUDA toolkit to the environment search path. + 1. Discover the directory of the CUDA toolkit that you installed. + + ```shell + ls /usr/local + ``` + + The toolkit is the highest number that looks like the pattern + `cuda-XX.Y`. The output of the command should resemble the + following: + + ```shell + bin cuda cuda-11 cuda-11.4 etc games include lib man sbin share src + ``` + + In this case, the directory is `cuda-11.4`. + 2. Add the CUDA toolkit path to your environment. You can run the following + command to append the path to your `~/.bashrc` file. Replace `[DIR]` + with the CUDA directory that you discovered in the previous step. + + ```shell + echo "export PATH=/usr/local/[DIR]/bin${PATH:+:${PATH}}" >> ~/.bashrc + ``` + + 3. Run `source ~/.bashrc` to activate the new environment search path. + +## 4. Install build tools + +Install the tools required to build qsim. This step might take a few minutes to +complete. + +```shell +sudo apt install cmake && sudo apt install pip && pip install pybind11 +``` + + +## 5. Create a GPU-enabled version of qsim + +1. Clone the qsim repository. + + ```shell + git clone https://github.com/quantumlib/qsim.git + ``` + +2. Run `cd qsim` to change your working directory to qsim. +3. Run `make` to compile qsim. 
When make detects the CUDA toolkit during + compilation, make builds the GPU version of qsim automatically. +4. Run `pip install .` to install your local version of qsimcirq. +5. Verify your qsim installation. + + ```shell + python3 -c "import qsimcirq; print(qsimcirq.qsim_gpu)" + ``` + + If the installation completed successfully, the output from the command + should resemble the following: + + ```none + + ``` + + +## 6. Verify your installation + +You can use the following code to verify that qsim uses your GPU. You can paste +the code directly into the REPL, or paste the code in a file. + +``` +# Import Cirq and qsim +import cirq +import qsimcirq + +# Instantiate qubits and create a circuit +q0, q1 = cirq.LineQubit.range(2) +circuit = cirq.Circuit(cirq.H(q0), cirq.CX(q0, q1)) + +# Instantiate a simulator that uses the GPU +gpu_options = qsimcirq.QSimOptions(use_gpu=True) +qsim_simulator = qsimcirq.QSimSimulator(qsim_options=gpu_options) + +# Run the simulation +print("Running simulation for the following circuit:") +print(circuit) + +qsim_results = qsim_simulator.compute_amplitudes( + circuit, bitstrings=[0b00, 0b01]) + +print("qsim results:") +print(qsim_results) +``` + +After a moment, you should see a result that looks similar to the following. + +```none +[(0.7071067690849304+0j), 0j] +``` + +### Optional: Use the NVIDIA cuQuantum SDK + +If you have the [NVIDIA cuQuantum SDK](https://developer.nvidia.com/cuquantum-sdk) +installed (instructions are provided +[here](https://docs.nvidia.com/cuda/cuquantum/custatevec/html/getting_started.html#installation-and-compilation), +cuStateVec v1.0.0 or higher is required), +you can use it with this tutorial. 
Before building qsim in step 5, +set the `CUQUANTUM_DIR` environment variable from the command line: + +```bash +export CUQUANTUM_DIR=[PATH_TO_CUQUANTUM_SDK] +``` + +Once you have built qsim, modify the `gpu_options` line like so: + +```python +gpu_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=1) +``` + +This instructs qsim to make use of its cuQuantum integration, which provides +improved performance on NVIDIA GPUs. If you experience issues with this +option, please file an issue on the qsim repository. + + +## Next steps + +After you finish, don't forget to stop or delete your VM on the Compute +Instances dashboard to prevent further billing. + +You are now ready to run your own large simulations on Google Cloud. For sample +code of a large circuit, see the [Simulate a large +circuit](https://quantumai.google/qsim/tutorials/q32d14) tutorial. diff --git a/docs/tutorials/multinode.md b/docs/tutorials/multinode.md new file mode 100644 index 00000000..8d425cc2 --- /dev/null +++ b/docs/tutorials/multinode.md @@ -0,0 +1,333 @@ +# Multinode quantum simulation using HTCondor on GCP + +In this tutorial, you will configure HTCondor to run multiple simulations of a +quantum circuit in parallel across multiple nodes. This method can be used to +accelerate Monte Carlo simulations of noisy quantum circuits. + +Objectives of this tutorial: + +* Use `terraform` to deploy a HTCondor cluster +* Run a multinode simulation using HTCondor +* Query cluster information and monitor running jobs in HTCondor +* Use `terraform` to destroy the cluster + +## 1. Configure your environment + +Although this tutorial can be run from your local computer, we recommend the use +of [Google Cloud Shell](https://cloud.google.com/shell). Cloud Shell has many useful tools pre-installed. + +Once you have completed the [Before you begin](./gcp_before_you_begin.md) +tutorial, open the [Cloud Shell in the Cloud Console](https://console.cloud.google.com/home/dashboard?cloudshell=true). 
+ +### Clone this repo + +In your Cloud Shell window, clone this Github repo. + +``` bash +git clone https://github.com/quantumlib/qsim.git +``` + +If you get an error saying something like `qsim already exists`, you may need +to delete the `qsim` directory with `rm -rf qsim` and rerun the clone command. + +### Change directory + +Change directory to the tutorial: + +``` bash +cd qsim/docs/tutorials/multinode/terraform +``` + +This is where you will use `terraform` to create the HTCondor cluster required to run your jobs. + +### Edit `init.sh` file to match your environment + +Using your favorite text file editor, open the `init.sh` file. The first few +lines should look like this: + +```bash +# ---- Edit below -----# + +export TF_VAR_project=[USER_PROJECT] +export TF_VAR_zone=us-east4-c +export TF_VAR_region=us-east4 +``` + +Replace `[USER_PROJECT]` with the project name you chose on the +`Before you begin` page. + +The other lines can optionally be modified to adjust your environment. +* The `TF_VAR_zone` and `TF_VAR_region` lines can be modified to select where +your project will create new jobs. + +#### Find out more + +* [Choosing a zone and region](https://cloud.google.com/compute/docs/regions-zones) + +### Source the `init.sh` file + +The edited `init.sh` file should be "sourced" in the cloud shell: + +``` bash +source init.sh +``` + +Respond `Agree` to any pop-ups that request permissions on the Google Cloud platform. + +The final outcome of this script will include: + +* A gcloud config setup correctly +* A service account created +* The appropriate permissions assigned to the service account +* A key file created to enable the use of Google Cloud automation. + +This will take up to 60 seconds. At the end you will see output about +permissions and the configuration of the account. + +## 2. Run terraform + +After the previous steps are completed, you can initialize `terraform` to begin +your cluster creation. 
The first step is to initialize the `terraform` state. +``` bash +terraform init +``` +A successful result will contain the text: +``` +Terraform has been successfully initialized! +``` + +### Run the `make` command + +For convenience, some terraform commands are prepared in a `Makefile`. This +means you can now create your cluster, with a simple `make` command. + +```bash +make apply +``` + +A successful run will show: + +``` +Apply complete! Resources: 4 added, 0 changed, 0 destroyed. +``` + +## 3. Connect to the submit node for HTCondor + +Although there are ways to run HTCondor commands from your local machine, +the normal path is to login to the submit node. From there you can run +commands to submit and monitor jobs on HTCondor. + +### List VMs that were created by HTCondor + +To see the VMs created by HTCondor, run: + +```bash +gcloud compute instances list +``` + +At this point in the tutorial, you will see two instances listed: + +``` +NAME: c-manager +ZONE: us-central1-a +MACHINE_TYPE: n1-standard-1 +PREEMPTIBLE: +INTERNAL_IP: X.X.X.X +EXTERNAL_IP: X.X.X.X +STATUS: RUNNING + +NAME: c-submit +ZONE: us-central1-a +MACHINE_TYPE: n1-standard-1 +PREEMPTIBLE: +INTERNAL_IP: X.X.X.X +EXTERNAL_IP: X.X.X.X +STATUS: RUNNING +``` + +### Connecting to the submit node + +To connect to the submit node, click the `Compute Engine` item on the Cloud +dashboard. This will open the VM Instances page, where you should see the two +instances listed above. In the `c-submit` row, click on the `SSH` button to +open a new window connected to the submit node. During this step, you may see a +prompt that reads `Connection via Cloud Identity-Aware Proxy Failed`; simply +click on `Connect without Identity-Aware Proxy` and the connection should +complete. + +This new window is logged into your HTCondor cluster. You will see a command +prompt that looks something like this: + +```bash +[mylogin@c-submit ~]$ +``` + +The following steps should be performed in this window. 
+ +### Checking the status + +You can run `condor_q` to verify if the HTCondor install is completed. The output should look something like this: + +``` +-- Schedd: c-submit.c.quantum-htcondor-14.internal : <10.150.0.2:9618?... @ 08/18/21 18:37:50 +OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS + +Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended +Total for drj: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended +Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended +``` + +If you get `command not found`, you will need to wait a few minutes for the HTCondor install to complete. + +## 4. Get the sample code and run it + +The HTCondor cluster is now ready for your jobs to be run. For this tutorial, +sample jobs have been provided in the Github repo. + +### Clone the repo on your cluster + +On the submit node, you can clone the repo to get access to previously +created submission files: + +```bash +git clone https://github.com/quantumlib/qsim.git +``` + +Then cd to the tutorial directory. + +```bash +cd qsim/docs/tutorials/multinode +``` + +### Submit a job + +Now it is possible to submit a job: +``` +condor_submit noiseless.sub +``` +This job will run the code in `noiseless3.py`, which executes a simple circuit and prints the results as a histogram. If successful, the output will be: +``` +Submitting job(s). +1 job(s) submitted to cluster 1. +``` +You can see the job in queue with the `condor_q` command. + +The job will take several minutes to finish. The time includes creating a VM +compute node, installing the HTCondor system and running the job. When complete, the following files will be stored in the `out` directory: + +* `out/log.1-0` contains a progress log for the job as it executes. +* `out/out.1-0` contains the final output of the job. +* `out/err.1-0` contains any error reports. This should be empty. 
+ +To view one of these files in the shell, you can run `cat out/[FILE]`, +replacing `[FILE]` with the name of the file to be viewed. + +## 5. Run multinode noise simulations + +Noise simulations make use of a [Monte Carlo +method](https://en.wikipedia.org/wiki/Monte_Carlo_method) for [quantum +trajectories](https://en.wikipedia.org/wiki/Quantum_Trajectory_Theory). + +### The noise.sub file + +To run multiple simulations, you can define a "submit" file. `noise.sub` is +an example of this file format, and is shown below. Notable features include: + +* `universe = docker` means that all jobs will run inside a `docker` container. +* `queue 50` submits 50 separate copies of the job. + +``` +universe = docker +docker_image = gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest +arguments = python3 noise3.py +should_transfer_files = YES +transfer_input_files = noise3.py +when_to_transfer_output = ON_EXIT +output = out/out.$(Cluster)-$(Process) +error = out/err.$(Cluster)-$(Process) +log = out/log.$(Cluster)-$(Process) +request_memory = 10GB +queue 50 +``` +The job can be submitted with the `condor_submit` command. +``` +condor_submit noise.sub +``` +The output should look like this: +``` +Submitting job(s).................................................. +50 job(s) submitted to cluster 2. +``` +To monitor the ongoing process of jobs running, you can take advantage of the +Linux `watch` command to run `condor_q` repeatedly: +``` +watch "condor_q; condor_status" +``` +The output of this command will show you the jobs in the queue as well as the +VMs being created to run the jobs. There is a limit of 20 VMs for this +configuration of the cluster. + +When the queue is empty, the command can be stopped with CTRL-C. + +The output from all trajectories will be stored in the `out` directory. 
To see +the results of all simulations together, you can run: +``` +cat out/out.2-* +``` +The output should look something like this: +``` +Counter({3: 462, 0: 452, 2: 50, 1: 36}) +Counter({0: 475, 3: 435, 1: 49, 2: 41}) +Counter({0: 450, 3: 440, 1: 59, 2: 51}) +Counter({0: 459, 3: 453, 2: 51, 1: 37}) +Counter({3: 471, 0: 450, 2: 46, 1: 33}) +Counter({3: 467, 0: 441, 1: 54, 2: 38}) +Counter({3: 455, 0: 455, 1: 50, 2: 40}) +Counter({3: 466, 0: 442, 2: 51, 1: 41}) +. +. +. +``` + +## 6. Shutting down + +**IMPORTANT**: To avoid excess billing for this project, it is important to +shut down the cluster. Return to the Cloud dashboard window for the steps below. + +If your Cloud Shell is still open, simply run: +``` +make destroy +``` +If your Cloud Shell closed at any point, you'll need to re-initialize it. +[Open a new shell](https://console.cloud.google.com/home/dashboard?cloudshell=true) +and run: +``` +cd qsim/docs/tutorials/multinode/terraform +source init.sh +make destroy +``` +After these commands complete, check the Compute Instances dashboard to verify +that all VMs have been shut down. This tutorial makes use of an experimental +[autoscaling script](./terraform/htcondor/autoscaler.py) to bring up and turn +down VMs as needed. If any VMs remain after several minutes, you may need to +shut them down manually, as described in the next section. + +## Next steps + +The file being run in the previous example was `noise3.py`. To run your own +simulations, simply create a new python file with your circuit and change the +`noise3.py` references in `noise.sub` to point to the new file. + +A detailed discussion of how to construct various types of noise in Cirq can be +found [here](https://quantumai.google/cirq/noise). 
+ +For more information about managing your VMs, see the following documentation +from Google Cloud: + +* [Stopping and starting a VM](https://cloud.google.com/compute/docs/instances/stop-start-instance) +* [Suspending and resuming an instance](https://cloud.google.com/compute/docs/instances/suspend-resume-instance) +* [Deleting a VM instance](https://cloud.google.com/compute/docs/instances/deleting-instance) + +As an alternative to Google Cloud, you can download the Docker container or the +qsim source code to run quantum simulations on your own high-performance +computing platform. diff --git a/docs/tutorials/multinode/noise.sub b/docs/tutorials/multinode/noise.sub new file mode 100644 index 00000000..cccc85b0 --- /dev/null +++ b/docs/tutorials/multinode/noise.sub @@ -0,0 +1,11 @@ +universe = docker +docker_image = gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest +arguments = python3 noise3.py +should_transfer_files = YES +transfer_input_files = noise3.py +when_to_transfer_output = ON_EXIT +output = out/out.$(Cluster)-$(Process) +error = out/err.$(Cluster)-$(Process) +log = out/log.$(Cluster)-$(Process) +request_memory = 10GB +queue 50 \ No newline at end of file diff --git a/docs/tutorials/multinode/noise3.py b/docs/tutorials/multinode/noise3.py new file mode 100644 index 00000000..f3abbdd5 --- /dev/null +++ b/docs/tutorials/multinode/noise3.py @@ -0,0 +1,18 @@ +import cirq, qsimcirq + +# Create a Bell state, |00) + |11) +q0, q1 = cirq.LineQubit.range(2) +circuit = cirq.Circuit(cirq.H(q0), cirq.CNOT(q0, q1), cirq.measure(q0, q1, key="m")) + +# Constructs a noise model that adds depolarizing noise after each gate. +noise = cirq.NoiseModel.from_noise_model_like(cirq.depolarize(p=0.05)) + +# Use the noise model to create a noisy circuit. 
+noisy_circuit = cirq.Circuit(noise.noisy_moments(circuit, system_qubits=[q0, q1])) + +sim = qsimcirq.QSimSimulator() +result = sim.run(noisy_circuit, repetitions=1000) +# Outputs a histogram dict of result:count pairs. +# Expected result is a bunch of 0s and 3s, with fewer 1s and 2s. +# (For comparison, the noiseless circuit will only have 0s and 3s) +print(result.histogram(key="m")) diff --git a/docs/tutorials/multinode/noiseless.sub b/docs/tutorials/multinode/noiseless.sub new file mode 100644 index 00000000..f4bde2c8 --- /dev/null +++ b/docs/tutorials/multinode/noiseless.sub @@ -0,0 +1,11 @@ +universe = docker +docker_image = gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest +arguments = python3 noiseless3.py +should_transfer_files = YES +transfer_input_files = noiseless3.py +when_to_transfer_output = ON_EXIT +output = out/out.$(Cluster)-$(Process) +error = out/err.$(Cluster)-$(Process) +log = out/log.$(Cluster)-$(Process) +request_memory = 10GB +queue 1 \ No newline at end of file diff --git a/docs/tutorials/multinode/noiseless3.py b/docs/tutorials/multinode/noiseless3.py new file mode 100644 index 00000000..f35dcb5c --- /dev/null +++ b/docs/tutorials/multinode/noiseless3.py @@ -0,0 +1,11 @@ +import cirq, qsimcirq + +# Create a Bell state, |00) + |11) +q0, q1 = cirq.LineQubit.range(2) +circuit = cirq.Circuit(cirq.H(q0), cirq.CNOT(q0, q1), cirq.measure(q0, q1, key="m")) + +sim = qsimcirq.QSimSimulator() +result = sim.run(circuit, repetitions=1000) +# Outputs a histogram dict of result:count pairs. +# Expected result is a bunch of 0s and 3s, with no 1s or 2s. 
+print(result.histogram(key="m")) diff --git a/docs/tutorials/multinode/out/placeholder b/docs/tutorials/multinode/out/placeholder new file mode 100644 index 00000000..e69de29b diff --git a/docs/tutorials/multinode/terraform/Makefile b/docs/tutorials/multinode/terraform/Makefile new file mode 100644 index 00000000..ebd5018d --- /dev/null +++ b/docs/tutorials/multinode/terraform/Makefile @@ -0,0 +1,7 @@ +.PHONY: apply destroy + +apply: + terraform apply -auto-approve + +destroy: + terraform destroy -auto-approve diff --git a/docs/tutorials/multinode/terraform/README.md b/docs/tutorials/multinode/terraform/README.md new file mode 100644 index 00000000..094f1ee8 --- /dev/null +++ b/docs/tutorials/multinode/terraform/README.md @@ -0,0 +1,3 @@ +# Multinode quantum simulation using HTCondor on GCP + +Please refer to the [README in the parent directory](../README.md). \ No newline at end of file diff --git a/docs/tutorials/multinode/terraform/htcondor/autoscaler.py b/docs/tutorials/multinode/terraform/htcondor/autoscaler.py new file mode 100644 index 00000000..7795d8c9 --- /dev/null +++ b/docs/tutorials/multinode/terraform/htcondor/autoscaler.py @@ -0,0 +1,378 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script for resizing managed instance group (MIG) cluster size based +# on the number of jobs in the Condor Queue. 
+ +from absl import app +from absl import flags +from pprint import pprint +from googleapiclient import discovery +from oauth2client.client import GoogleCredentials + +import os +import math +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--p", required=True, help="Project id", type=str) +parser.add_argument( + "--z", + required=True, + help="Name of GCP zone where the managed instance group is located", + type=str, +) +parser.add_argument( + "--r", + required=True, + help="Name of GCP region where the managed instance group is located", + type=str, +) +parser.add_argument( + "--mz", + required=False, + help="Enabled multizone (regional) managed instance group", + action="store_true", +) +parser.add_argument( + "--g", required=True, help="Name of the managed instance group", type=str +) +parser.add_argument( + "--c", required=True, help="Maximum number of compute instances", type=int +) +parser.add_argument( + "--v", + default=0, + help="Increase output verbosity. 1-show basic debug info. 
2-show detail debug info", + type=int, + choices=[0, 1, 2], +) +parser.add_argument( + "--d", + default=0, + help="Dry Run, default=0, if 1, then no scaling actions", + type=int, + choices=[0, 1], +) + + +args = parser.parse_args() + + +class AutoScaler: + def __init__(self, multizone=False): + + self.multizone = multizone + # Obtain credentials + self.credentials = GoogleCredentials.get_application_default() + self.service = discovery.build("compute", "v1", credentials=self.credentials) + + if self.multizone: + self.instanceGroupManagers = self.service.regionInstanceGroupManagers() + else: + self.instanceGroupManagers = self.service.instanceGroupManagers() + + # Remove specified instance from MIG and decrease MIG size + def deleteFromMig(self, instance): + instanceUrl = "https://www.googleapis.com/compute/v1/projects/" + self.project + if self.multizone: + instanceUrl += "/regions/" + self.region + else: + instanceUrl += "/zones/" + self.zone + instanceUrl += "/instances/" + instance + + instances_to_delete = {"instances": [instanceUrl]} + + requestDelInstance = self.instanceGroupManagers.deleteInstances( + project=self.project, + **self.zoneargs, + instanceGroupManager=self.instance_group_manager, + body=instances_to_delete, + ) + + # execute if not a dry-run + if not self.dryrun: + response = requestDelInstance.execute() + if self.debug > 0: + print("Request to delete instance " + instance) + pprint(response) + return response + return "Dry Run" + + def getInstanceTemplateInfo(self): + requestTemplateName = self.instanceGroupManagers.get( + project=self.project, + **self.zoneargs, + instanceGroupManager=self.instance_group_manager, + fields="instanceTemplate", + ) + responseTemplateName = requestTemplateName.execute() + template_name = "" + + if self.debug > 1: + print("Request for the template name") + pprint(responseTemplateName) + + if len(responseTemplateName) > 0: + template_url = responseTemplateName.get("instanceTemplate") + template_url_partitioned = 
template_url.split("/") + template_name = template_url_partitioned[len(template_url_partitioned) - 1] + + requestInstanceTemplate = self.service.instanceTemplates().get( + project=self.project, instanceTemplate=template_name, fields="properties" + ) + responseInstanceTemplateInfo = requestInstanceTemplate.execute() + + if self.debug > 1: + print("Template information") + pprint(responseInstanceTemplateInfo["properties"]) + + machine_type = responseInstanceTemplateInfo["properties"]["machineType"] + is_preemtible = responseInstanceTemplateInfo["properties"]["scheduling"][ + "preemptible" + ] + if self.debug > 0: + print("Machine Type: " + machine_type) + print("Is preemtible: " + str(is_preemtible)) + request = self.service.machineTypes().get( + project=self.project, zone=self.zone, machineType=machine_type + ) + response = request.execute() + guest_cpus = response["guestCpus"] + if self.debug > 1: + print("Machine information") + pprint(responseInstanceTemplateInfo["properties"]) + if self.debug > 0: + print("Guest CPUs: " + str(guest_cpus)) + + instanceTemlateInfo = { + "machine_type": machine_type, + "is_preemtible": is_preemtible, + "guest_cpus": guest_cpus, + } + return instanceTemlateInfo + + def scale(self): + # diagnosis + if self.debug > 1: + print("Launching autoscaler.py with the following arguments:") + print("project_id: " + self.project) + print("zone: " + self.zone) + print("region: " + self.region) + print(f"multizone: {self.multizone}") + print("group_manager: " + self.instance_group_manager) + print("computeinstancelimit: " + str(self.compute_instance_limit)) + print("debuglevel: " + str(self.debug)) + + if self.multizone: + self.zoneargs = {"region": self.region} + else: + self.zoneargs = {"zone": self.zone} + + # Get total number of jobs in the queue that includes number of jos waiting as well as number of jobs already assigned to nodes + queue_length_req = ( + 'condor_q -totals -format "%d " Jobs -format "%d " Idle -format "%d " Held' + ) + 
queue_length_resp = os.popen(queue_length_req).read().split() + + if len(queue_length_resp) > 1: + queue = int(queue_length_resp[0]) + idle_jobs = int(queue_length_resp[1]) + on_hold_jobs = int(queue_length_resp[2]) + else: + queue = 0 + idle_jobs = 0 + on_hold_jobs = 0 + + print("Total queue length: " + str(queue)) + print("Idle jobs: " + str(idle_jobs)) + print("Jobs on hold: " + str(on_hold_jobs)) + + instanceTemlateInfo = self.getInstanceTemplateInfo() + if self.debug > 1: + print("Information about the compute instance template") + pprint(instanceTemlateInfo) + + self.cores_per_node = instanceTemlateInfo["guest_cpus"] + print("Number of CPU per compute node: " + str(self.cores_per_node)) + + # Get state for all jobs in Condor + name_req = "condor_status -af name state" + slot_names = os.popen(name_req).read().splitlines() + if self.debug > 1: + print("Currently running jobs in Condor") + print(slot_names) + + # Adjust current queue length by the number of jobs that are on-hold + queue -= on_hold_jobs + if on_hold_jobs > 0: + print("Adjusted queue length: " + str(queue)) + + # Calculate number of instances to satisfy current job queue length + if queue > 0: + self.size = int(math.ceil(float(queue) / float(self.cores_per_node))) + if self.debug > 0: + print( + "Calculating size of MIG: ⌈" + + str(queue) + + "/" + + str(self.cores_per_node) + + "⌉ = " + + str(self.size) + ) + else: + self.size = 0 + + # If compute instance limit is specified, can not start more instances than specified in the limit + if self.compute_instance_limit > 0 and self.size > self.compute_instance_limit: + self.size = self.compute_instance_limit + print( + "MIG target size will be limited by " + str(self.compute_instance_limit) + ) + + print("New MIG target size: " + str(self.size)) + + # Get current number of instances in the MIG + requestGroupInfo = self.instanceGroupManagers.get( + project=self.project, + **self.zoneargs, + instanceGroupManager=self.instance_group_manager, + ) + 
responseGroupInfo = requestGroupInfo.execute() + currentTarget = int(responseGroupInfo["targetSize"]) + print("Current MIG target size: " + str(currentTarget)) + + if self.debug > 1: + print("MIG Information:") + print(responseGroupInfo) + + if self.size == 0 and currentTarget == 0: + print( + "No jobs in the queue and no compute instances running. Nothing to do" + ) + exit() + + if self.size == currentTarget: + print( + "Running correct number of compute nodes to handle number of jobs in the queue" + ) + exit() + + if self.size < currentTarget: + print("Scaling down. Looking for nodes that can be shut down") + # Find nodes that are not busy (all slots showing status as "Unclaimed") + + node_busy = {} + for slot_name in slot_names: + name_status = slot_name.split() + if len(name_status) > 1: + name = name_status[0] + status = name_status[1] + slot = "NO-SLOT" + slot_server = name.split("@") + if len(slot_server) > 1: + slot = slot_server[0] + server = slot_server[1].split(".")[0] + else: + server = slot_server[0].split(".")[0] + + if self.debug > 0: + print(slot + ", " + server + ", " + status + "\n") + + if server not in node_busy: + if status == "Unclaimed": + node_busy[server] = False + else: + node_busy[server] = True + else: + if status != "Unclaimed": + node_busy[server] = True + + if self.debug > 1: + print("Compuute node busy status:") + print(node_busy) + + # Shut down nodes that are not busy + for node in node_busy: + if not node_busy[node]: + print("Will shut down: " + node + " ...") + respDel = self.deleteFromMig(node) + if self.debug > 1: + print("Shut down request for compute node " + node) + pprint(respDel) + + if self.debug > 1: + print("Scaling down complete") + + if self.size > currentTarget: + print( + "Scaling up. 
Need to increase number of instances to " + str(self.size) + ) + # Request to resize + request = self.instanceGroupManagers.resize( + project=self.project, + **self.zoneargs, + instanceGroupManager=self.instance_group_manager, + size=self.size, + ) + response = request.execute() + if self.debug > 1: + print("Requesting to increase MIG size") + pprint(response) + print("Scaling up complete") + + +def main(): + + scaler = AutoScaler(args.mz) + + # Project ID + scaler.project = args.p # Ex:'slurm-var-demo' + + # Name of the zone where the managed instance group is located + scaler.zone = args.z # Ex: 'us-central1-f' + + # Name of the region where the managed instance group is located + scaler.region = args.r # Ex: 'us-central1' + + # The name of the managed instance group. + scaler.instance_group_manager = args.g # Ex: 'condor-compute-igm' + + # Default number of cores per instance, will be replaced with actual value + scaler.cores_per_node = 4 + + # Default number of running instances that the managed instance group should maintain at any given time. This number will go up and down based on the load (number of jobs in the queue) + scaler.size = 0 + + # Dry run: 0, run scaling; 1, only provide info. + scaler.dryrun = args.d > 0 + + # Debug level: 1-print debug information, 2 - print detail debug information + scaler.debug = 0 + if args.v: + scaler.debug = args.v + + # Limit for the maximum number of compute instances. 
If zero (default setting), no limit will be enforced by the script + scaler.compute_instance_limit = 0 + if args.c: + scaler.compute_instance_limit = abs(args.c) + + scaler.scale() + + +if __name__ == "__main__": + main() diff --git a/docs/tutorials/multinode/terraform/htcondor/noise.py b/docs/tutorials/multinode/terraform/htcondor/noise.py new file mode 100644 index 00000000..b5542959 --- /dev/null +++ b/docs/tutorials/multinode/terraform/htcondor/noise.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 + +import cirq, qsimcirq + +# Create a Bell state, |00) + |11) +q0, q1 = cirq.LineQubit.range(2) +circuit = cirq.Circuit(cirq.H(q0), cirq.CNOT(q0, q1), cirq.measure(q0, q1, key="m")) + +# Constructs a noise model that adds depolarizing noise after each gate. +noise = cirq.NoiseModel.from_noise_model_like(cirq.depolarize(p=0.05)) + +# Use the noise model to create a noisy circuit. +noisy_circuit = cirq.Circuit(noise.noisy_moments(circuit, system_qubits=[q0, q1])) + +sim = qsimcirq.QSimSimulator() +result = sim.run(noisy_circuit, repetitions=1000) +# Outputs a histogram dict of result:count pairs. +# Expected result is a bunch of 0s and 3s, with fewer 1s and 2s. 
+# (For comparison, the noiseless circuit will only have 0s and 3s) +print(result.histogram(key="m")) diff --git a/docs/tutorials/multinode/terraform/htcondor/noise.sub b/docs/tutorials/multinode/terraform/htcondor/noise.sub new file mode 100644 index 00000000..aa5fff28 --- /dev/null +++ b/docs/tutorials/multinode/terraform/htcondor/noise.sub @@ -0,0 +1,12 @@ +universe = docker +docker_image = gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim +executable = /usr/bin/python3 +arguments = noise.py +transfer_input_files = noise.py +should_transfer_files = YES +when_to_transfer_output = ON_EXIT +output = out/out.$(Cluster)-$(Process) +error = out/err.$(Cluster)-$(Process) +log = out/log.$(Cluster)-$(Process) +request_memory = 1GB +queue 100 diff --git a/docs/tutorials/multinode/terraform/htcondor/resources.tf b/docs/tutorials/multinode/terraform/htcondor/resources.tf new file mode 100644 index 00000000..0b4212d9 --- /dev/null +++ b/docs/tutorials/multinode/terraform/htcondor/resources.tf @@ -0,0 +1,384 @@ +variable "cluster_name" { + type = string + default = "condor" +} +variable "admin_email" { + type = string + default = "" +} +variable "osversion" { + type = string + default = "7" +} +variable "osimage" { + type = string + default = "hpc-centos-7" +} +variable "osproject" { + type = string + default = "cloud-hpc-image-public" +} +variable "condorversion" { + type = string + default = "" +} +variable "project" { + type = string +} +variable "zone" { + type = string +} +variable "region" { + type = string +} +variable "numzones" { + type = string +} +variable "multizone" { + type = bool +} +variable "min_replicas" { + type = number + default = 0 +} +variable "max_replicas" { + type = number + default = 20 +} +variable "use_preemptibles" { + type = bool + default = true +} +variable "metric_target_loadavg" { + type = number + default = "1.0" +} +variable "metric_target_queue" { + type = number + default = 10 +} +variable "compute_instance_type" { + type = string 
+ default = "n1-standard-1" +} +variable "instance_type" { + type = string + default = "n1-standard-1" +} +variable "service_account" { + type = string + default = "default" +} +locals{ + autoscaler = file("${path.module}/autoscaler.py") + compute_startup = templatefile( + "${path.module}/startup-centos.sh", + { + "project" = var.project, + "cluster_name" = var.cluster_name, + "htserver_type" = "compute", + "osversion" = var.osversion, + "zone" = var.zone, + "region" = var.region, + "multizone" = var.multizone, + "condorversion" = var.condorversion, + "max_replicas" = var.max_replicas, + "autoscaler" = "", + "admin_email" = var.admin_email + }) + submit_startup = templatefile( + "${path.module}/startup-centos.sh", + { + "project" = var.project, + "cluster_name" = var.cluster_name, + "htserver_type" = "submit", + "osversion" = var.osversion, + "condorversion" = var.condorversion, + "zone" = var.zone, + "region" = var.region, + "multizone" = var.multizone, + "max_replicas" = var.max_replicas, + "autoscaler" = local.autoscaler, + "admin_email" = var.admin_email + }) + manager_startup = templatefile( + "${path.module}/startup-centos.sh", + { + "project" = var.project, + "cluster_name" = var.cluster_name, + "htserver_type" = "manager", + "osversion" = var.osversion, + "zone" = var.zone, + "region" = var.region, + "multizone" = var.multizone, + "max_replicas" = var.max_replicas, + "condorversion" = var.condorversion, + "autoscaler" = "", + "admin_email" = var.admin_email + }) +} +data "google_compute_image" "startimage" { + family = var.osimage + project = var.osproject +} +resource "google_compute_instance" "condor-manager" { + boot_disk { + auto_delete = "true" + device_name = "boot" + + initialize_params { + image = data.google_compute_image.startimage.self_link + size = "200" + type = "pd-standard" + } + + mode = "READ_WRITE" + } + + can_ip_forward = "false" + deletion_protection = "false" + enable_display = "false" + + machine_type = var.instance_type + 
metadata_startup_script = local.manager_startup + name = "${var.cluster_name}-manager" + network_interface { + access_config { + network_tier = "PREMIUM" + } + + network = "default" + #network_ip = "10.128.0.2" + subnetwork = "default" + subnetwork_project = var.project + } + + project = var.project + + scheduling { + automatic_restart = "true" + on_host_maintenance = "MIGRATE" + preemptible = "false" + } + + service_account { + email = var.service_account + scopes = ["https://www.googleapis.com/auth/cloud-platform"] + } + + shielded_instance_config { + enable_integrity_monitoring = "true" + enable_secure_boot = "false" + enable_vtpm = "true" + } + + tags = ["${var.cluster_name}-manager"] + zone = var.zone +} + +resource "google_compute_instance" "condor-submit" { + boot_disk { + auto_delete = "true" + device_name = "boot" + + initialize_params { + image = data.google_compute_image.startimage.self_link + size = "200" + type = "pd-standard" + } + + mode = "READ_WRITE" + } + + can_ip_forward = "false" + deletion_protection = "false" + enable_display = "false" + + labels = { + goog-dm = "mycondorcluster" + } + + machine_type = var.instance_type + metadata_startup_script = local.submit_startup + name = "${var.cluster_name}-submit" + + network_interface { + access_config { + network_tier = "PREMIUM" + } + + network = "default" + #network_ip = "10.128.0.3" + subnetwork = "default" + subnetwork_project = var.project + } + + project = var.project + + scheduling { + automatic_restart = "true" + on_host_maintenance = "MIGRATE" + preemptible = "false" + } + + service_account { + email = var.service_account + # email = "487217491196-compute@developer.gserviceaccount.com" + #scopes = ["https://www.googleapis.com/auth/monitoring.write", "https://www.googleapis.com/auth/compute", "https://www.googleapis.com/auth/servicecontrol", "https://www.googleapis.com/auth/devstorage.read_only", "https://www.googleapis.com/auth/logging.write", 
"https://www.googleapis.com/auth/service.management.readonly", "https://www.googleapis.com/auth/trace.append"] + scopes = ["https://www.googleapis.com/auth/cloud-platform"] + } + + shielded_instance_config { + enable_integrity_monitoring = "true" + enable_secure_boot = "false" + enable_vtpm = "true" + } + + tags = ["${var.cluster_name}-submit"] + zone = var.zone +} +resource "google_compute_instance_template" "condor-compute" { + can_ip_forward = "false" + + disk { + auto_delete = "true" + boot = "true" + device_name = "boot" + disk_size_gb = "200" + mode = "READ_WRITE" + source_image = data.google_compute_image.startimage.self_link + type = "PERSISTENT" + } + + machine_type = var.compute_instance_type + + metadata = { + startup-script = local.compute_startup + } + + name = "${var.cluster_name}-compute" + + network_interface { + access_config { + network_tier = "PREMIUM" + } + + network = "default" + } + + project = var.project + region = var.zone + + scheduling { + automatic_restart = "false" + on_host_maintenance = "TERMINATE" + preemptible = var.use_preemptibles + } + + service_account { + email = var.service_account + scopes = ["cloud-platform"] + } + + tags = ["${var.cluster_name}-compute"] +} +resource "google_compute_instance_group_manager" "condor-compute-igm" { + count = var.multizone ? 0 : 1 + base_instance_name = var.cluster_name + name = var.cluster_name + + project = var.project + target_size = "0" + + update_policy { + max_surge_fixed = 2 + minimal_action = "REPLACE" + type = "OPPORTUNISTIC" + } + + version { + instance_template = google_compute_instance_template.condor-compute.self_link + name = "" + } + timeouts { + create = "60m" + delete = "2h" + } + # Yup, didn't want to use this, but I was getting create and destroy errors. + depends_on = [ + google_compute_instance_template.condor-compute + ] + zone = var.zone +} + +resource "google_compute_region_instance_group_manager" "condor-compute-igm" { + count = var.multizone ? 
1 : 0 + base_instance_name = var.cluster_name + name = var.cluster_name + + project = var.project + target_size = "0" + + update_policy { + max_surge_fixed = var.numzones + minimal_action = "REPLACE" + type = "OPPORTUNISTIC" + } + + version { + instance_template = google_compute_instance_template.condor-compute.self_link + name = "" + } + timeouts { + create = "60m" + delete = "2h" + } + # Yup, didn't want to use this, but I was getting create and destroy errors. + depends_on = [ + google_compute_instance_template.condor-compute + ] + region = var.region +} +/* +resource "google_compute_autoscaler" "condor-compute-as" { + name = "${var.cluster_name}-compute-as" + project = var.project + target = google_compute_instance_group_manager.condor-compute-igm.self_link + zone = var.zone + + autoscaling_policy { + cooldown_period = 30 + max_replicas = var.max_replicas + min_replicas = var.min_replicas + + cpu_utilization { + target = 0.2 + } + + metric { + name = "custom.googleapis.com/q0" + target = var.metric_target_queue + type = "GAUGE" + } + metric { + name = "custom.googleapis.com/la0" + target = var.metric_target_loadavg + type = "GAUGE" + } + + } + + timeouts { + create = "60m" + delete = "2h" + } + + depends_on = [ + google_compute_instance_group_manager.condor-compute-igm + ] +} +*/ + +output "startup_script" { + value = local.submit_startup +} \ No newline at end of file diff --git a/docs/tutorials/multinode/terraform/htcondor/startup-centos.sh b/docs/tutorials/multinode/terraform/htcondor/startup-centos.sh new file mode 100644 index 00000000..7ec9572d --- /dev/null +++ b/docs/tutorials/multinode/terraform/htcondor/startup-centos.sh @@ -0,0 +1,189 @@ +#!/bin/bash -x + +SERVER_TYPE="${htserver_type}" + +############################################################## +## Install and configure HTCONDOR +############################################################## + +if [ "${condorversion}" == "" ]; then + CONDOR_INSTALL_OPT="condor" +else + 
CONDOR_INSTALL_OPT="condor-all-${condorversion}" + # email = "487217491196-compute@developer.gserviceaccount.com" +fi +if [ "${osversion}" == "6" ]; then + CONDOR_STARTUP_CMD="service condor start" +else + CONDOR_STARTUP_CMD="systemctl start condor;systemctl enable condor" +fi +CONDOR_REPO_URL=https://research.cs.wisc.edu/htcondor/yum/repo.d/htcondor-stable-rhel${osversion}.repo + +sleep 2 #Give it some time to setup yum +cd /tmp +yum update -y +yum install -y wget curl net-tools vim gcc python3 git +wget https://research.cs.wisc.edu/htcondor/yum/RPM-GPG-KEY-HTCondor +rpm --import RPM-GPG-KEY-HTCondor +cd /etc/yum.repos.d && wget $CONDOR_REPO_URL +yum install -y $CONDOR_INSTALL_OPT + +############################################################## +# Install Docker on Compute Nodes +############################################################## +if [ "$SERVER_TYPE" == "compute" ]; then + yum install -y yum-utils + yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo + yum install -y docker-ce docker-ce-cli containerd.io + systemctl start docker + systemctl enable docker + usermod -aG docker condor +fi + +############################################################## +# Configure Condor Daemons +############################################################## +cd /tmp +cat < condor_config.local +DISCARD_SESSION_KEYRING_ON_STARTUP=False +CONDOR_ADMIN=${admin_email} +CONDOR_HOST=${cluster_name}-manager +EOF + +# Case for compute +if [ "$SERVER_TYPE" == "compute" ]; then +cat <> condor_config.local +# Standard Stuff +DAEMON_LIST = MASTER, STARTD +ALLOW_WRITE = \$(ALLOW_WRITE), \$(CONDOR_HOST) +# Run Dynamics Slots +NUM_SLOTS = 1 +NUM_SLOTS_TYPE_1 = 1 +SLOT_TYPE_1 = 100% +SLOT_TYPE_1_PARTITIONABLE = TRUE +# Allowing Run as Owner +STARTER_ALLOW_RUNAS_OWNER = TRUE +SUBMIT_ATTRS = RunAsOwner +RunAsOwner = True +UID_DOMAIN = c.${project}.internal +TRUST_UID_DOMAIN = True +HasDocker = True +EOF1 +fi + +# Case for manager +if [ "$SERVER_TYPE" == 
"manager" ]; then +cat <> condor_config.local +DAEMON_LIST = MASTER, COLLECTOR, NEGOTIATOR +ALLOW_WRITE = * +EOF2 +fi + +# Case for submit +if [ "$SERVER_TYPE" == "submit" ]; then +cat <> condor_config.local +DAEMON_LIST = MASTER, SCHEDD +ALLOW_WRITE = \$(ALLOW_WRITE), \$(CONDOR_HOST) +# Allowing Run as Owner +STARTER_ALLOW_RUNAS_OWNER = TRUE +SUBMIT_ATTRS = RunAsOwner +RunAsOwner = True +UID_DOMAIN = c.${project}.internal +TRUST_UID_DOMAIN = True +EOF3 +fi + + +mkdir -p /etc/condor/config.d +mv condor_config.local /etc/condor/config.d +eval $CONDOR_STARTUP_CMD + +############################################################## +# Install and configure logging agent for StackDriver +############################################################## +curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh +bash add-logging-agent-repo.sh --also-install + +# Install Custom Metric Plugin: +google-fluentd-gem install fluent-plugin-stackdriver-monitoring + +# Create Fluentd Config + +cat < condor.conf + + @type tail + format none + path /var/log/condor/*Log + pos_file /var/lib/google-fluentd/pos/condor.pos + read_from_head true + tag condor + + + @type exec + command condor_status -direct `hostname` -format "%f " TotalLoadAvg | cut -d " " -f 1 + + keys la0 + + tag condor_la0 + run_interval 5s + + + @type exec + command condor_status -schedd -format "%d" TotalIdleJobs + + keys q0 + + tag condor_q0 + run_interval 5s + + + @type stackdriver_monitoring + project ${project} + + key la0 + type custom.googleapis.com/la0 + metric_kind GAUGE + value_type DOUBLE + + + + @type stackdriver_monitoring + project ${project} + + key q0 + type custom.googleapis.com/q0 + metric_kind GAUGE + value_type INT64 + + +EOF +mkdir -p /etc/google-fluentd/config.d/ +mv condor.conf /etc/google-fluentd/config.d/ + +if [ "$SERVER_TYPE" == "submit" ]; then +mkdir -p /var/log/condor/jobs +touch /var/log/condor/jobs/stats.log +chmod 666 /var/log/condor/jobs/stats.log +fi + +service google-fluentd 
restart + +# Add Python Libraries and Autoscaler +if [ "$SERVER_TYPE" == "submit" ]; then + python3 -m pip install --upgrade oauth2client + python3 -m pip install --upgrade google-api-python-client + python3 -m pip install --upgrade absl-py + +cat < /opt/autoscaler.py +${autoscaler} +EOFZ + +# Create cron entry for autoscaler. Log to /var/log/messages + +echo "* * * * * python3 /opt/autoscaler.py --p ${project} --z ${zone} --r ${region} %{ if multizone }--mz %{ endif }--g ${cluster_name} --c ${max_replicas} | logger " |crontab - + +fi + +# Now we can let everyone know that the setup is complete. + +wall "******* HTCondor system configuration complete ********" diff --git a/docs/tutorials/multinode/terraform/init.sh b/docs/tutorials/multinode/terraform/init.sh new file mode 100644 index 00000000..775ef679 --- /dev/null +++ b/docs/tutorials/multinode/terraform/init.sh @@ -0,0 +1,31 @@ +# ---- Edit below -----# + +export TF_VAR_project=[USER_PROJECT] +export TF_VAR_zone=us-east4-c +export TF_VAR_region=us-east4 + +export TF_VAR_multizone=false +# For regional/multizone, set this to the number of regions in the zone. 
+export TF_VAR_numzones=4 + +# ---- Do not edit below -----# + +export TF_VAR_project_id=${TF_VAR_project} +export TF_VAR_service_account="htcondor@"${TF_VAR_project}".iam.gserviceaccount.com" + +gcloud config set project $TF_VAR_project +gcloud services enable compute.googleapis.com +gcloud services enable monitoring.googleapis.com +gcloud config set compute/zone $TF_VAR_zone +gcloud config set compute/region $TF_VAR_region + +gcloud config list + +gcloud iam service-accounts create htcondor --display-name="Run HTCondor" + +# Add roles +gcloud projects add-iam-policy-binding ${TF_VAR_project} --member serviceAccount:${TF_VAR_service_account} --role roles/compute.admin +gcloud projects add-iam-policy-binding ${TF_VAR_project} --member serviceAccount:${TF_VAR_service_account} --role roles/iam.serviceAccountUser +gcloud projects add-iam-policy-binding ${TF_VAR_project} --member serviceAccount:${TF_VAR_service_account} --role roles/monitoring.admin +gcloud projects add-iam-policy-binding ${TF_VAR_project} --member serviceAccount:${TF_VAR_service_account} --role roles/logging.admin +gcloud projects add-iam-policy-binding ${TF_VAR_project} --member serviceAccount:${TF_VAR_service_account} --role roles/autoscaling.metricsWriter diff --git a/docs/tutorials/multinode/terraform/init.tf b/docs/tutorials/multinode/terraform/init.tf new file mode 100644 index 00000000..8660f9fa --- /dev/null +++ b/docs/tutorials/multinode/terraform/init.tf @@ -0,0 +1,2 @@ +provider "google" { +} diff --git a/docs/tutorials/multinode/terraform/main.tf b/docs/tutorials/multinode/terraform/main.tf new file mode 100644 index 00000000..9e92cc74 --- /dev/null +++ b/docs/tutorials/multinode/terraform/main.tf @@ -0,0 +1,40 @@ +variable "project" { + type=string +} +variable "zone" { + type=string +} +variable "region" { + type=string +} +variable "multizone" { + type=bool +} +variable "numzones" { + type=string +} + +variable "cluster_name" { + type = string + default = "c" + description = "Name used 
to prefix resources in cluster." + +} + +module "htcondor" { + source = "./htcondor/" + cluster_name = var.cluster_name + project = var.project + zone = var.zone + region = var.region + multizone = var.multizone + numzones = var.numzones + osversion = "7" + max_replicas=20 + min_replicas=0 + compute_instance_type = "custom-2-11264" + service_account="htcondor@${var.project}.iam.gserviceaccount.com" + use_preemptibles=false + osproject ="centos-cloud" + osimage ="centos-7" +} \ No newline at end of file diff --git a/docs/tutorials/noisy_qsimcirq.ipynb b/docs/tutorials/noisy_qsimcirq.ipynb index 6f366593..af2e5c89 100644 --- a/docs/tutorials/noisy_qsimcirq.ipynb +++ b/docs/tutorials/noisy_qsimcirq.ipynb @@ -98,7 +98,7 @@ "try:\n", " import qsimcirq\n", "except ImportError:\n", - " !pip install qsimcirq==0.9.5 --quiet\n", + " !pip install qsimcirq --quiet\n", " import qsimcirq" ] }, @@ -284,4 +284,4 @@ "metadata": {} } ] -} \ No newline at end of file +} diff --git a/docs/tutorials/qsimcirq_gcp.md b/docs/tutorials/qsimcirq_gcp.md index 36ac704b..3945564a 100644 --- a/docs/tutorials/qsimcirq_gcp.md +++ b/docs/tutorials/qsimcirq_gcp.md @@ -53,7 +53,7 @@ Then click on *Create* for a new VM instance: ![alt_text](../images/qsimcirq_gcp/image7.png ) -### Build a Container Optimized VM +### Build a Container Optimized VM with container deployed To create the VM use the steps in sequence below: @@ -72,14 +72,17 @@ To create the VM use the steps in sequence below: * Choose the [Machine Type](https://cloud.google.com/blog/products/compute/choose-the-right-google-compute-engine-machine-type-for-you): n2-standard-16 * 16 CPUs * 64GB memory -* Choose the Boot Disk image:[ Container-Optimized OS](https://cloud.google.com/container-optimized-os/docs/concepts/features-and-benefits) * Leave the remaining as defaults. ![alt_text](../images/qsimcirq_gcp/image10.png ) +> Select `Deploy a container image to this VM instance`. -Finally, enable HTTP access and click *Create*. 
- -![alt_text](../images/qsimcirq_gcp/image8.png ) +For the container image enter: +``` +gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest +``` +![alt_text](../images/qsimcirq_gcp/container.png ) +> This may take a few minutes to complete, even after the VM is created, the container will take some time to complete. ## Preparing your local system to connect to Colab @@ -145,32 +148,7 @@ You should now see the command line prompt from your VM: wellhello@qsim-1 ~ $ ``` -### Run the Jupyter / qsim container on your VM - -At the command prompt you can now start a Docker container with all the required -code to run simulations. Start the container: - -``` -$ docker run -v `pwd`:/homedir -p 8888:8888 gcr.io/quantum-builds/github.com/quantumlib/jupyter_qsim:latest -``` - -You should see several lines of output ending with lines like below. (If you get -an error about `permission denied` you may need to run docker with `sudo` as -[described here](https://docs.docker.com/engine/reference/run/#general-form)). - - -``` -To access the notebook, open this file in a browser: - file:///root/.local/share/jupyter/runtime/nbserver-1-open.html -Or copy and paste one of these URLs: - http://e1f7a7cca9fa:8888/?token=aa16e1b6d3f51c58928037d34cc6854dac47347dd4c0eae5 - or http://127.0.0.1:8888/?token=aa16e1b6d3f51c58928037d34cc6854dac47347dd4c0eae5 -``` - -Copy the last URL in the output. Edit the URL to replace `127.0.0.1` with -`localhost`. Save this URL for the next step. This URL points to your local -runtime, running as a Docker container on your VM. - +The container port is now forwarded to your local machine. ## Connect Colab to your local runtime @@ -185,9 +163,9 @@ get the UI: Select *Connect to local runtime*. You will see the UI: - + -Pass the edited URL from the previous section, then click *Connect*: +Type in the URL `http://localhost:8888/` , then click *Connect*: @@ -213,7 +191,7 @@ In the previous step, you copied a URL like below. 
It is easy to just copy that URL and paste it directly into a browser running on your local machine. ``` -http://127.0.0.1:8888/?token=7191178ae9aa4ebe1698b07bb67dea1d289cfd0e0b960373 +http://127.0.0.1:8888/ ``` In the browser you should now see the Jupyter UI: @@ -230,11 +208,6 @@ You can now run these cells as you would in any notebook. ![alt_text](../images/qsimcirq_gcp/image1.png) -If you choose to modify the notebook, you can save it on the *qsim-1* VM from *File* -> *Save As*, and saving to `/homedir/mynotebook.ipynb`. This will save in your home directory on your VM. If you intend to destroy the VM after this tutorial, either download the notebooks from the VM or save directly from your browser. - -![alt_text](../images/qsimcirq_gcp/image4.png) - - ## Run interactively To run interactively within the container, you can open a second shell window to @@ -312,24 +285,6 @@ output vector: (0.5+0.5j)|0⟩ + (0.5-0.5j)|1⟩ You have successfully simulated a quantum circuit on Google Cloud Platform using a Docker container. -### Running your own script - -If you want to run a Python script, you can locate a file in the home directory -on your VM, then run something like the following in the container shell: - -``` -$ python3 /homedir/myscript.py -``` - -### Exit the container - -Exit the container by typing ctrl-d twice. 
You will see the output like: - -``` -[root@79804d33f250 /]# exit -``` - - ## Clean up To avoid incurring charges to your Google Cloud Platform account for the diff --git a/docs/usage.md b/docs/usage.md index 7bc5e525..33b7f317 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -13,7 +13,7 @@ Sample circuits are provided in ## qsim_base usage ``` -./qsim_base.x -c circuit_file -d maxtime -t num_threads -f max_fused_size -v verbosity +./qsim_base.x -c circuit_file -d maxtime -t num_threads -f max_fused_size -v verbosity -z ``` | Flag | Description | @@ -22,11 +22,23 @@ Sample circuits are provided in |`-d maxtime` | maximum time | |`-t num_threads` | number of threads to use| |`-f max_fused_size` | maximum fused gate size| -|`-v verbosity` | verbosity level (0,1,>1)| +|`-v verbosity` | verbosity level (0,1,2,3,4,5)| +|`-z` | set flush-to-zero and denormals-are-zeros MXCSR control flags| qsim_base computes all the amplitudes and just prints the first eight of them (or a smaller number for 1- or 2-qubit circuits). +Verbosity levels are described in the following table. 
+ +| Verbosity level | Description | +|-----------------|-------------| +| 0 | no additional information| +| 1 | add total simulation runtime| +| 2 | add initialization runtime and fuser runtime| +| 3 | add basic fuser statistics| +| 4 | add simulation runtime for each fused gate| +| 5 | additional fuser information (qubit indices for each fused gate)| + Example: ``` ./qsim_base.x -c ../circuits/circuit_q24 -d 16 -t 8 -v 1 @@ -35,7 +47,7 @@ Example: ## qsim_von_neumann usage ``` -./qsim_von_neumann.x -c circuit_file -d maxtime -t num_threads -f max_fused_size -v verbosity +./qsim_von_neumann.x -c circuit_file -d maxtime -t num_threads -f max_fused_size -v verbosity -z ``` @@ -45,7 +57,8 @@ Example: |`-d maxtime` | maximum time | |`-t num_threads` | number of threads to use| |`-f max_fused_size` | maximum fused gate size| -|`-v verbosity` | verbosity level (0,1,>1)| +|`-v verbosity` | verbosity level (0,1,2,3,4,5)| +|`-z` | set flush-to-zero and denormals-are-zeros MXCSR control flags| qsim_von_neumann computes all the amplitudes and calculates the von Neumann entropy. 
Note that this can be quite slow for large circuits and small thread @@ -64,18 +77,19 @@ Example: -i input_files \ -o output_files \ -f max_fused_size \ - -t num_threads -v verbosity + -t num_threads -v verbosity -z ``` | Flag | Description | |-------|------------| |`-c circuit_file` | circuit file to run| -|`-d times_to_save_results` | comma-separated list of circuit times to save results at| +|`-d times_to_save_results` | comma-separated list of circuit times to save results at| |`-i input_files` | comma-separated list of bitstring input files| |`-o output_files` | comma-separated list of amplitude output files| |`-t num_threads` | number of threads to use| |`-f max_fused_size` | maximum fused gate size| -|`-v verbosity` | verbosity level (0,1,>1)| +|`-v verbosity` | verbosity level (0,1,2,3,4,5)| +|`-z` | set flush-to-zero and denormals-are-zeros MXCSR control flags| qsim_amplitudes reads input files of bitstrings, computes the corresponding amplitudes at specified times and writes them to output files. 
@@ -88,6 +102,39 @@ Example: ./qsim_amplitudes.x -c ../circuits/circuit_q24 -t 4 -d 16,24 -i ../circuits/bitstrings_q24_s1,../circuits/bitstrings_q24_s2 -o ampl_q24_s1,ampl_q24_s2 -v 1 ``` +## qsim_qtrajectory_cuda usage + +``` +./qsim_qtrajectory_cuda.x -c circuit_file \ + -d times_to_calculate_observables \ + -a amplitude_damping_const \ + -p phase_damping_const \ + -t traj0 -n num_trajectories \ + -f max_fused_size \ + -v verbosity +``` + +| Flag | Description | +|-------|------------| +|`-c circuit_file` | circuit file to run| +|`-d times_to_calculate_observables` | comma-separated list of circuit times to calculate observables at| +|`-a amplitude_damping_const` | amplitude damping constant | +|`-p phase_damping_const` | phase damping constant | +|`-t traj0` | starting trajectory | +|`-n num_trajectories ` | number of trajectories to run starting with `traj0` | +|`-f max_fused_size` | maximum fused gate size| +|`-v verbosity` | verbosity level (0,1,2,3,4,5)| + +qsim_qtrajectory_cuda runs on GPUs. qsim_qtrajectory_cuda performs quantum +trajactory simulations with amplitude damping and phase damping noise channels. +qsim_qtrajectory_cuda calculates observables (operator X at each qubit) at +specified times. 
+ +Example: +``` +./qsim_qtrajectory_cuda.x -c ../circuits/circuit_q24 -d 8,16,32 -a 0.005 -p 0.005 -t 0 -n 100 -f 4 -v 0 +``` + ## qsimh_base usage ``` @@ -97,20 +144,20 @@ Example: -w prefix \ -p num_prefix_gates \ -r num_root_gates \ - -t num_threads -v verbosity + -t num_threads -v verbosity -z ``` | Flag | Description | |-------|------------| |`-c circuit_file` | circuit file to run| |`-d maxtime` | maximum time | -|`-k part1_qubits` | comma-separated list of qubit indices for part 1 | +|`-k part1_qubits` | comma-separated list of qubit indices for part 1| |`-w prefix`| prefix value | |`-p num_prefix_gates` | number of prefix gates| |`-r num_root_gates` | number of root gates| |`-t num_threads` | number of threads to use| -|`-v verbosity` | verbosity level (0,>0)| - +|`-v verbosity` | verbosity level (0,1,4,5)| +|`-z` | set flush-to-zero and denormals-are-zeros MXCSR control flags| qsimh_base just computes and just prints the first eight amplitudes. The hybrid Schrödinger-Feynman method is used. The lattice is split into two parts. @@ -176,21 +223,22 @@ maximum "time". 
-p num_prefix_gates \ -r num_root_gates \ -i input_file -o output_file \ - -t num_threads -v verbosity + -t num_threads -v verbosity -z ``` | Flag | Description | |-------|------------| |`-c circuit_file` | circuit file to run| |`-d maxtime` | maximum time | -|`-k part1_qubits` | comma-separated list of qubit indices for part 1 | +|`-k part1_qubits` | comma-separated list of qubit indices for part 1| |`-w prefix`| prefix value | |`-p num_prefix_gates` | number of prefix gates| |`-r num_root_gates` | number of root gates| |`-i input_file` | bitstring input file| |`-o output_file` | amplitude output file| |`-t num_threads` | number of threads to use| -|`-v verbosity` | verbosity level (0,>0)| +|`-v verbosity` | verbosity level (0,1,4,5)| +|`-z` | set flush-to-zero and denormals-are-zeros MXCSR control flags| qsimh_amplitudes reads the input file of bitstrings, computes the corresponding amplitudes and writes them to the output file. The hybrid Schrödinger-Feynman diff --git a/jupyter/Dockerfile b/jupyter/Dockerfile index 29043611..35610cd9 100644 --- a/jupyter/Dockerfile +++ b/jupyter/Dockerfile @@ -1,8 +1,15 @@ # Base OS FROM centos:8 USER root -# Install baseline +# Centos 8 has reach end of life: https://www.centos.org/centos-linux-eol/ +# Configuration must be loaded from the vault. 
+RUN pushd /etc/yum.repos.d/ && \ + sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \ + sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \ + popd + +# Install baseline RUN yum -y update && \ yum install -y epel-release && \ yum group install -y "Development Tools" && \ @@ -22,6 +29,5 @@ RUN jupyter serverextension enable --py jupyter_http_over_ws CMD ["jupyter-notebook", "--port=8888", "--no-browser",\ "--ip=0.0.0.0", "--allow-root", \ "--NotebookApp.allow_origin='*'", \ - "--NotebookApp.port_retries=0"] - - #"--NotebookApp.allow_origin='https://colab.research.google.com'", \ \ No newline at end of file + "--NotebookApp.port_retries=0", \ + "--NotebookApp.token=''"] diff --git a/lib/BUILD b/lib/BUILD index 3316f531..9be39dea 100644 --- a/lib/BUILD +++ b/lib/BUILD @@ -1,8 +1,14 @@ package(default_visibility = ["//visibility:public"]) +# Libraries of the following form: +# # cuda_library +# cc_library(...) +# are converted to cuda_library rules when imported to the Google codebase. +# Do not modify this tag. 
+ ##### Aggregate libraries ##### -# Full qsim library +# Full qsim library, minus CUDA cc_library( name = "qsim_lib", hdrs = [ @@ -34,6 +40,66 @@ cc_library( "run_qsimh.h", "seqfor.h", "simmux.h", + "simulator.h", + "simulator_avx.h", + "simulator_avx512.h", + "simulator_basic.h", + "simulator_sse.h", + "statespace_avx.h", + "statespace_avx512.h", + "statespace_basic.h", + "statespace_sse.h", + "statespace.h", + "umux.h", + "unitaryspace.h", + "unitaryspace_avx.h", + "unitaryspace_avx512.h", + "unitaryspace_basic.h", + "unitaryspace_sse.h", + "unitary_calculator_avx.h", + "unitary_calculator_avx512.h", + "unitary_calculator_basic.h", + "unitary_calculator_sse.h", + "util.h", + "util_cpu.h", + "vectorspace.h", + ], +) + +# Full qsim library, including CUDA +# cuda_library +cc_library( + name = "qsim_cuda_lib", + hdrs = [ + "bits.h", + "bitstring.h", + "channel.h", + "channels_cirq.h", + "circuit_noisy.h", + "circuit_qsim_parser.h", + "circuit.h", + "expect.h", + "formux.h", + "fuser.h", + "fuser_basic.h", + "fuser_mqubit.h", + "gate.h", + "gate_appl.h", + "gates_cirq.h", + "gates_qsim.h", + "hybrid.h", + "io_file.h", + "io.h", + "matrix.h", + "mps_simulator.h", + "mps_statespace.h", + "parfor.h", + "qtrajectory.h", + "run_qsim.h", + "run_qsimh.h", + "seqfor.h", + "simmux.h", + "simulator.h", "simulator_avx.h", "simulator_avx512.h", "simulator_basic.h", @@ -58,6 +124,7 @@ cc_library( "unitary_calculator_basic.h", "unitary_calculator_sse.h", "util.h", + "util_cpu.h", "util_cuda.h", "vectorspace.h", "vectorspace_cuda.h", @@ -86,6 +153,7 @@ cc_library( "run_qsim.h", "seqfor.h", "simmux.h", + "simulator.h", "simulator_avx.h", "simulator_avx512.h", "simulator_basic.h", @@ -104,6 +172,7 @@ cc_library( "unitary_calculator_basic.h", "unitary_calculator_sse.h", "util.h", + "util_cpu.h", "vectorspace.h", ], ) @@ -131,6 +200,7 @@ cc_library( "run_qsimh.h", "seqfor.h", "simmux.h", + "simulator.h", "simulator_avx.h", "simulator_avx512.h", "simulator_basic.h", @@ -141,6 +211,7 @@ 
cc_library( "statespace_basic.h", "statespace_sse.h", "util.h", + "util_cpu.h", "vectorspace.h", ], ) @@ -171,6 +242,12 @@ cc_library( hdrs = ["util.h"], ) +cc_library( + name = "util_cpu", + hdrs = ["util_cpu.h"], +) + +# cuda_library cc_library( name = "util_cuda", hdrs = ["util_cuda.h"], @@ -331,6 +408,7 @@ cc_library( hdrs = ["vectorspace.h"], ) +# cuda_library cc_library( name = "vectorspace_cuda", hdrs = ["vectorspace_cuda.h"], @@ -384,6 +462,7 @@ cc_library( ], ) +# cuda_library cc_library( name = "statespace_cuda", hdrs = [ @@ -399,11 +478,17 @@ cc_library( ### Simulator libraries ### +cc_library( + name = "simulator_base", + hdrs = ["simulator.h"], + deps = [":bits"], +) + cc_library( name = "simulator_avx", hdrs = ["simulator_avx.h"], deps = [ - ":bits", + ":simulator_base", ":statespace_avx", ], ) @@ -412,7 +497,7 @@ cc_library( name = "simulator_avx512", hdrs = ["simulator_avx512.h"], deps = [ - ":bits", + ":simulator_base", ":statespace_avx512", ], ) @@ -421,7 +506,7 @@ cc_library( name = "simulator_basic", hdrs = ["simulator_basic.h"], deps = [ - ":bits", + ":simulator_base", ":statespace_basic", ], ) @@ -430,11 +515,12 @@ cc_library( name = "simulator_sse", hdrs = ["simulator_sse.h"], deps = [ - ":bits", + ":simulator_base", ":statespace_sse", ], ) +# cuda_library cc_library( name = "simulator_cuda", hdrs = [ @@ -475,7 +561,10 @@ cc_library( cc_library( name = "channel", hdrs = ["channel.h"], - deps = [":gate"], + deps = [ + ":gate", + ":matrix", + ], ) cc_library( @@ -557,7 +646,7 @@ cc_library( name = "unitary_calculator_avx", hdrs = ["unitary_calculator_avx.h"], deps = [ - ":bits", + ":simulator_base", ":unitaryspace_avx", ], ) @@ -566,7 +655,7 @@ cc_library( name = "unitary_calculator_avx512", hdrs = ["unitary_calculator_avx512.h"], deps = [ - ":bits", + ":simulator_base", ":unitaryspace_avx512", ], ) @@ -575,7 +664,7 @@ cc_library( name = "unitary_calculator_basic", hdrs = ["unitary_calculator_basic.h"], deps = [ - ":bits", + ":simulator_base", 
":unitaryspace_basic", ], ) @@ -584,7 +673,7 @@ cc_library( name = "unitary_calculator_sse", hdrs = ["unitary_calculator_sse.h"], deps = [ - ":bits", + ":simulator_base", ":unitaryspace_sse", ], ) diff --git a/lib/channel.h b/lib/channel.h index 5f2a187b..372a174c 100644 --- a/lib/channel.h +++ b/lib/channel.h @@ -15,7 +15,11 @@ #ifndef CHANNEL_H_ #define CHANNEL_H_ +#include +#include + #include "gate.h" +#include "matrix.h" namespace qsim { @@ -24,6 +28,8 @@ namespace qsim { */ template struct KrausOperator { + using fp_type = typename Gate::fp_type; + enum Kind { kNormal = 0, kMeasurement = gate::kMeasurement, @@ -49,6 +55,68 @@ struct KrausOperator { * one operation. */ std::vector ops; + + /** + * Product of K^\dagger and K. This can be empty if unitary = true. + */ + Matrix kd_k; + + /** + * Qubits kd_k acts on. This can be empty if unitary = true. + */ + std::vector qubits; + + /** + * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on. + */ + void CalculateKdKMatrix() { + if (ops.size() == 1) { + kd_k = ops[0].matrix; + MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k); + qubits = ops[0].qubits; + } else if (ops.size() > 1) { + std::set qubit_map; + + for (const auto& op : ops) { + for (unsigned q : op.qubits) { + qubit_map.insert(q); + } + } + + unsigned num_qubits = qubit_map.size(); + + qubits.resize(0); + qubits.reserve(num_qubits); + + for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) { + qubits.push_back(*it); + } + + MatrixIdentity(unsigned{1} << num_qubits, kd_k); + + for (const auto& op : ops) { + if (op.qubits.size() == num_qubits) { + MatrixMultiply(num_qubits, op.matrix, kd_k); + } else { + unsigned mask = 0; + + for (auto q : op.qubits) { + for (unsigned i = 0; i < num_qubits; ++i) { + if (q == qubits[i]) { + mask |= unsigned{1} << i; + break; + } + } + } + + MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k); + } + } + + auto m = kd_k; + MatrixDaggerMultiply(num_qubits, m, 
kd_k); + } + } }; /** diff --git a/lib/channels_cirq.h b/lib/channels_cirq.h index a8fd87ce..69f1df9d 100644 --- a/lib/channels_cirq.h +++ b/lib/channels_cirq.h @@ -237,11 +237,22 @@ struct GeneralizedAmplitudeDampingChannel { using M = Cirq::MatrixGate1; auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}}, - {normal, 0, p2, {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}}, - {normal, 0, p3, {M::Create(time, q, { 0, 0, s1, 0, 0, 0, 0, 0})}}, - {normal, 0, p3, {M::Create(time, q, { 0, 0, 0, 0, s2, 0, 0, 0})}}, + return {{normal, 0, p1, + {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}, + {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}, + {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}, + {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, }; } @@ -281,8 +292,14 @@ struct AmplitudeDampingChannel { using M = Cirq::MatrixGate1; auto normal = KrausOperator>::kNormal; - return {{normal, 0, p1, {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}}, - {normal, 0, p2, {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}}, + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, }; } @@ -320,8 +337,14 @@ struct PhaseDampingChannel { using M = Cirq::MatrixGate1; auto normal = KrausOperator>::kNormal; - return {{normal, 0, p1, {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}}, - {normal, 0, p2, {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}}, + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 
0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, }; } @@ -351,8 +374,14 @@ struct ResetChannel { using M = Cirq::MatrixGate1; auto normal = KrausOperator>::kNormal; - return {{normal, 0, 0, {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}}, - {normal, 0, 0, {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}}, + return {{normal, 0, 0, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}, + {1, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, + {normal, 0, 0, + {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, 1, 0}, {q}, + }, }; } }; diff --git a/lib/channels_qsim.h b/lib/channels_qsim.h new file mode 100644 index 00000000..5c07bccf --- /dev/null +++ b/lib/channels_qsim.h @@ -0,0 +1,117 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNELS_QSIM_H_ +#define CHANNELS_QSIM_H_ + +#include +#include +#include + +#include "channel.h" +#include "gates_qsim.h" + +namespace qsim { + +/** + * Amplitude damping channel factory. 
+ */ +template +struct AmplitudeDampingChannel { + AmplitudeDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns an amplitude damping channel factory object. + */ +template +inline AmplitudeDampingChannel amplitude_damp(double gamma) { + return AmplitudeDampingChannel(gamma); +} + +/** + * Phase damping channel factory. + */ +template +struct PhaseDampingChannel { + PhaseDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns a phase damping channel factory object. 
+ */ +template +inline PhaseDampingChannel phase_damp(double gamma) { + return PhaseDampingChannel(gamma); +} + +} // namespace qsim + +#endif // CHANNELS_QSIM_H_ diff --git a/lib/expect.h b/lib/expect.h index 2943c713..38fabfe4 100644 --- a/lib/expect.h +++ b/lib/expect.h @@ -42,18 +42,27 @@ struct OpString { * @param ket Temporary state vector. * @return The computed expectation value. */ -template +template std::complex ExpectationValue( const typename Fuser::Parameter& param, const std::vector>& strings, - const typename Simulator::StateSpace& ss, const Simulator& simulator, - const typename Simulator::State& state, typename Simulator::State& ket) { + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const typename Simulator::State& state, + typename Simulator::State& ket) { std::complex eval = 0; + if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) { + ket = state_space.Create(state.num_qubits()); + if (state_space.IsNull(ket)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return eval; + } + } + for (const auto& str : strings) { if (str.ops.size() == 0) continue; - ss.Copy(state, ket); + state_space.Copy(state, ket); if (str.ops.size() == 1) { const auto& op = str.ops[0]; @@ -70,7 +79,7 @@ std::complex ExpectationValue( } } - eval += str.weight * ss.InnerProduct(state, ket); + eval += str.weight * state_space.InnerProduct(state, ket); } return eval; @@ -88,7 +97,7 @@ std::complex ExpectationValue( * @param state The state of the system. * @return The computed expectation value. 
*/ -template +template std::complex ExpectationValue( const std::vector>& strings, const Simulator& simulator, const typename Simulator::State& state) { @@ -123,8 +132,8 @@ std::complex ExpectationValue( break; } - auto matrix = CalculateFusedMatrix(fgate); - auto r = simulator.ExpectationValue(fgate.qubits, matrix.data(), state); + auto r = simulator.ExpectationValue( + fgate.qubits, fgate.matrix.data(), state); eval += str.weight * r; } } diff --git a/lib/fuser.h b/lib/fuser.h index 927349f1..93933975 100644 --- a/lib/fuser.h +++ b/lib/fuser.h @@ -50,6 +50,10 @@ struct GateFused { * Ordered list of component gates. */ std::vector gates; + /** + * Fused gate matrix. + */ + Matrix matrix; }; /** @@ -134,16 +138,14 @@ class Fuser { /** * Multiplies component gate matrices of a fused gate. * @param gate Fused gate. - * @return Matrix product of component matrices. */ -template -inline Matrix CalculateFusedMatrix(const FusedGate& gate) { - Matrix matrix; - MatrixIdentity(unsigned{1} << gate.qubits.size(), matrix); +template +inline void CalculateFusedMatrix(FusedGate& gate) { + MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix); for (auto pgate : gate.gates) { if (gate.qubits.size() == pgate->qubits.size()) { - MatrixMultiply(gate.qubits.size(), pgate->matrix, matrix); + MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix); } else { unsigned mask = 0; @@ -157,11 +159,31 @@ inline Matrix CalculateFusedMatrix(const FusedGate& gate) { } MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix, - gate.qubits.size(), matrix); + gate.qubits.size(), gate.matrix); } } +} - return matrix; +/** + * Multiplies component gate matrices for a range of fused gates. + * @param gbeg, gend The iterator range [gbeg, gend) of fused gates. 
+ */ +template +inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) { + for (auto g = gbeg; g != gend; ++g) { + if (g->kind != gate::kMeasurement) { + CalculateFusedMatrix(*g); + } + } +} + +/** + * Multiplies component gate matrices for a vector of fused gates. + * @param gates The vector of fused gates. + */ +template +inline void CalculateFusedMatrices(std::vector& gates) { + CalculateFusedMatrices(gates.begin(), gates.end()); } } // namespace qsim diff --git a/lib/fuser_basic.h b/lib/fuser_basic.h index 345db25a..2fafb14a 100644 --- a/lib/fuser_basic.h +++ b/lib/fuser_basic.h @@ -53,31 +53,31 @@ class BasicGateFuser final : public Fuser { /** * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. Gates fused with this method are not - * multiplied together until ApplyFusedGate is called on the output. - * To respect specific time boundaries while fusing gates, use the other - * version of this method below. + * two-qubit gates will get fused. To respect specific time boundaries while + * fusing gates, use the other version of this method below. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by 'gates'. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gates The gates (or pointers to the gates) to be fused. * Gate times of the gates that act on the same qubits should be ordered. * Gates that are out of time order should not cross the time boundaries * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. 
*/ static std::vector FuseGates(const Parameter& param, - unsigned num_qubits, - const std::vector& gates) { - return FuseGates(param, num_qubits, gates.cbegin(), gates.cend(), {}); + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); } /** * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. Gates fused with this method are not - * multiplied together until ApplyFusedGate is called on the output. + * two-qubit gates will get fused. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by 'gates'. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gates The gates (or pointers to the gates) to be fused. * Gate times of the gates that act on the same qubits should be ordered. * Gates that are out of time order should not cross the time boundaries @@ -85,45 +85,46 @@ class BasicGateFuser final : public Fuser { * @param times_to_split_at Ordered list of time steps (boundaries) at which * to separate fused gates. Each element of the output will contain gates * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( const Parameter& param, - unsigned num_qubits, const std::vector& gates, - const std::vector& times_to_split_at) { - return FuseGates(param, num_qubits, gates.cbegin(), gates.cend(), - times_to_split_at); + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); } /** * Stores sets of gates that can be applied together. 
Only one- and - * two-qubit gates will get fused. Gates fused with this method are not - * multiplied together until ApplyFusedGate is called on the output. - * To respect specific time boundaries while fusing gates, use the other - * version of this method below. + * two-qubit gates will get fused. To respect specific time boundaries while + * fusing gates, use the other version of this method below. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by gates. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates * (or pointers to gates) in. Gate times of the gates that act on the same * qubits should be ordered. Gates that are out of time order should not * cross the time boundaries set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( - const Parameter& param, unsigned num_qubits, + const Parameter& param, unsigned max_qubit1, typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast) { - return FuseGates(param, num_qubits, gfirst, glast, {}); + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); } /** * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. Gates fused with this method are not - * multiplied together until ApplyFusedGate is called on the output. + * two-qubit gates will get fused. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by gates. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. 
* @param gfirst, glast The iterator range [gfirst, glast) to fuse gates * (or pointers to gates) in. Gate times of the gates that act on the same * qubits should be ordered. Gates that are out of time order should not @@ -132,14 +133,16 @@ class BasicGateFuser final : public Fuser { * @param times_to_split_at Ordered list of time steps (boundaries) at which * to separate fused gates. Each element of the output will contain gates * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( - const Parameter& param, unsigned num_qubits, + const Parameter& param, unsigned max_qubit1, typename std::vector::const_iterator gfirst, typename std::vector::const_iterator glast, - const std::vector& times_to_split_at) { + const std::vector& times_to_split_at, + bool fuse_matrix = true) { std::vector gates_fused; if (gfirst >= glast) return gates_fused; @@ -159,7 +162,7 @@ class BasicGateFuser final : public Fuser { std::vector gates_seq; // Lattice of gates: qubits "hyperplane" and time direction. - std::vector> gates_lat(num_qubits); + std::vector> gates_lat(max_qubit1); // Current unfused gate. 
auto gate_it = gfirst; @@ -168,7 +171,7 @@ class BasicGateFuser final : public Fuser { gates_seq.resize(0); gates_seq.reserve(num_gates); - for (unsigned k = 0; k < num_qubits; ++k) { + for (unsigned k = 0; k < max_qubit1; ++k) { gates_lat[k].resize(0); gates_lat[k].reserve(128); } @@ -179,9 +182,7 @@ class BasicGateFuser final : public Fuser { if (gate.time > times[l]) break; - if (GateIsOutOfOrder(gate.time, gate.qubits, gates_lat) - || GateIsOutOfOrder(gate.time, gate.controlled_by, gates_lat)) { - IO::errorf("gate is out of time order.\n"); + if (!ValidateGate(gate, max_qubit1, gates_lat)) { gates_fused.resize(0); return gates_fused; } @@ -190,7 +191,7 @@ class BasicGateFuser final : public Fuser { auto& mea_gates_at_time = measurement_gates[gate.time]; if (mea_gates_at_time.size() == 0) { gates_seq.push_back(&gate); - mea_gates_at_time.reserve(num_qubits); + mea_gates_at_time.reserve(max_qubit1); } mea_gates_at_time.push_back(&gate); @@ -214,7 +215,7 @@ class BasicGateFuser final : public Fuser { } } - std::vector last(num_qubits, 0); + std::vector last(max_qubit1, 0); const RGate* delayed_measurement_gate = nullptr; @@ -243,11 +244,11 @@ class BasicGateFuser final : public Fuser { } gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits, - pgate, {pgate}}); + pgate, {pgate}, {}}); } else if (pgate->qubits.size() == 1) { unsigned q0 = pgate->qubits[0]; - GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}}; + GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}}; last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); gate_f.gates.push_back(gates_lat[q0][last[q0]]); @@ -260,7 +261,8 @@ class BasicGateFuser final : public Fuser { if (Done(last[q0], pgate->time, gates_lat[q0])) continue; - GateFused gate_f = {pgate->kind, pgate->time, {q0, q1}, pgate, {}}; + GateFused gate_f = + {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}}; do { last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); @@ -277,7 +279,7 @@ class 
BasicGateFuser final : public Fuser { } } - for (unsigned q = 0; q < num_qubits; ++q) { + for (unsigned q = 0; q < max_qubit1; ++q) { auto l = last[q]; if (l == gates_lat[q].size()) continue; @@ -290,7 +292,7 @@ class BasicGateFuser final : public Fuser { const auto& mea_gates_at_time = measurement_gates[pgate->time]; - GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}}; + GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}}; gate_f.gates.reserve(mea_gates_at_time.size()); // Fuse measurement gates with equal times. @@ -307,6 +309,14 @@ class BasicGateFuser final : public Fuser { if (gate_it == glast) break; } + if (fuse_matrix) { + for (auto& gate_f : gates_fused) { + if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) { + CalculateFusedMatrix(gate_f); + } + } + } + return gates_fused; } @@ -338,7 +348,7 @@ class BasicGateFuser final : public Fuser { std::vector& gates_fused) { auto pgate = gates_lat[q][k]; - GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}}; + GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}}; gate_f.gates.push_back(pgate); k = Advance(k + 1, gates_lat[q], gate_f.gates); @@ -348,17 +358,34 @@ class BasicGateFuser final : public Fuser { return k; } - template - static bool GateIsOutOfOrder(unsigned time, - const std::vector& qubits, - const GatesLat& gates_lat) { - for (unsigned q : qubits) { - if (!gates_lat[q].empty() && time <= gates_lat[q].back()->time) { - return true; + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if 
(q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; } } - return false; + return true; } }; diff --git a/lib/fuser_mqubit.h b/lib/fuser_mqubit.h index 66273893..8ddb029e 100644 --- a/lib/fuser_mqubit.h +++ b/lib/fuser_mqubit.h @@ -152,31 +152,31 @@ class MultiQubitGateFuser final : public Fuser { }; /** - * Stores sets of gates that can be applied together. Note that - * gates fused with this method are not multiplied together until - * ApplyFusedGate is called on the output. To respect specific time - * boundaries while fusing gates, use the other version of this method below. + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by 'gates'. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gates The gates (or pointers to the gates) to be fused. * Gate times of the gates that act on the same qubits should be ordered. * Gates that are out of time order should not cross the time boundaries * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. 
*/ static std::vector FuseGates(const Parameter& param, - unsigned num_qubits, - const std::vector& gates) { - return FuseGates(param, num_qubits, gates.cbegin(), gates.cend(), {}); + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); } /** - * Stores sets of gates that can be applied together. Note that - * gates fused with this method are not multiplied together until - * ApplyFusedGate is called on the output. + * Stores sets of gates that can be applied together. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by 'gates'. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gates The gates (or pointers to the gates) to be fused. * Gate times of the gates that act on the same qubits should be ordered. * Gates that are out of time order should not cross the time boundaries @@ -184,44 +184,45 @@ class MultiQubitGateFuser final : public Fuser { * @param times_to_split_at Ordered list of time steps (boundaries) at which * to separate fused gates. Each element of the output will contain gates * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( const Parameter& param, - unsigned num_qubits, const std::vector& gates, - const std::vector& times_to_split_at) { - return FuseGates(param, num_qubits, gates.cbegin(), gates.cend(), - times_to_split_at); + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); } /** - * Stores sets of gates that can be applied together. 
Note that - * gates fused with this method are not multiplied together until - * ApplyFusedGate is called on the output. To respect specific time - * boundaries while fusing gates, use the other version of this method below. + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by gates. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates * (or pointers to gates) in. Gate times of the gates that act on the same * qubits should be ordered. Gates that are out of time order should not * cross the time boundaries set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( - const Parameter& param, unsigned num_qubits, + const Parameter& param, unsigned max_qubit1, typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast) { - return FuseGates(param, num_qubits, gfirst, glast, {}); + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); } /** - * Stores sets of gates that can be applied together. Note that - * gates fused with this method are not multiplied together until - * ApplyFusedGate is called on the output. + * Stores sets of gates that can be applied together. * @param param Options for gate fusion. - * @param num_qubits The number of qubits acted on by gates. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates * (or pointers to gates) in. 
Gate times of the gates that act on the same * qubits should be ordered. Gates that are out of time order should not @@ -230,14 +231,16 @@ class MultiQubitGateFuser final : public Fuser { * @param times_to_split_at Ordered list of time steps (boundaries) at which * to separate fused gates. Each element of the output will contain gates * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. * @return A vector of fused gate objects. Each element is a set of gates * acting on a specific pair of qubits which can be applied as a group. */ static std::vector FuseGates( - const Parameter& param, unsigned num_qubits, + const Parameter& param, unsigned max_qubit1, typename std::vector::const_iterator gfirst, typename std::vector::const_iterator glast, - const std::vector& times_to_split_at) { + const std::vector& times_to_split_at, + bool fuse_matrix = true) { std::vector fused_gates; if (gfirst >= glast) return fused_gates; @@ -250,7 +253,7 @@ class MultiQubitGateFuser final : public Fuser { auto epochs = Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); - LinkManager link_manager(num_qubits * num_gates); + LinkManager link_manager(max_qubit1 * num_gates); // Auxillary data structures. // Sequence of intermediate fused gates. @@ -258,10 +261,10 @@ class MultiQubitGateFuser final : public Fuser { // Gate "lattice". std::vector gates_lat; // Sequences of intermediate fused gates ordered by gate size. 
- std::vector> fgates(num_qubits + 1); + std::vector> fgates(max_qubit1 + 1); gates_seq.reserve(num_gates); - gates_lat.reserve(num_qubits); + gates_lat.reserve(max_qubit1); Scratch scratch; @@ -274,10 +277,10 @@ class MultiQubitGateFuser final : public Fuser { scratch.stack.reserve(8); Stat stat; - stat.num_gates.resize(num_qubits + 1, 0); + stat.num_gates.resize(max_qubit1 + 1, 0); unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size); - max_fused_size = std::min(max_fused_size, num_qubits); + max_fused_size = std::min(max_fused_size, max_qubit1); auto gate_it = gfirst; @@ -285,9 +288,9 @@ class MultiQubitGateFuser final : public Fuser { for (std::size_t l = 0; l < epochs.size(); ++l) { gates_seq.resize(0); gates_lat.resize(0); - gates_lat.resize(num_qubits, nullptr); + gates_lat.resize(max_qubit1, nullptr); - for (unsigned i = 0; i <= num_qubits; ++i) { + for (unsigned i = 0; i <= max_qubit1; ++i) { fgates[i].resize(0); } @@ -300,9 +303,7 @@ class MultiQubitGateFuser final : public Fuser { if (gate.time > epochs[l]) break; - if (GateIsOutOfOrder(gate.time, gate.qubits, gates_lat) - || GateIsOutOfOrder(gate.time, gate.controlled_by, gates_lat)) { - IO::errorf("gate is out of time order.\n"); + if (!ValidateGate(gate, max_qubit1, gates_lat)) { fused_gates.resize(0); return fused_gates; } @@ -317,8 +318,8 @@ class MultiQubitGateFuser final : public Fuser { gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt}); last_mea_gate = &gates_seq.back(); - last_mea_gate->qubits.reserve(num_qubits); - last_mea_gate->links.reserve(num_qubits); + last_mea_gate->qubits.reserve(max_qubit1); + last_mea_gate->links.reserve(max_qubit1); ++stat.num_fused_mea_gates; } @@ -391,20 +392,53 @@ class MultiQubitGateFuser final : public Fuser { if (max_fused_size > 2) { FuseGateSequences( - max_fused_size, num_qubits, scratch, gates_seq, stat, fused_gates); + max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates); } else { + unsigned prev_time = 0; + + std::vector 
orphaned_gates; + orphaned_gates.reserve(max_qubit1); + for (auto& fgate : gates_seq) { - if (fgate.gates.size() > 0) { - // Assume fgate.qubits (gate.qubits) are sorted. - fused_gates.push_back({fgate.parent->kind, fgate.parent->time, - std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates)}); - - if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; + if (fgate.gates.size() == 0) continue; + + if (prev_time != fgate.parent->time) { + if (orphaned_gates.size() > 0) { + FuseOrphanedGates( + max_fused_size, stat, orphaned_gates, fused_gates); + orphaned_gates.resize(0); } + + prev_time = fgate.parent->time; + } + + if (fgate.qubits.size() == 1 && max_fused_size > 1 + && fgate.visited != kMeaCnt && !fgate.parent->unfusible) { + orphaned_gates.push_back(&fgate); + continue; + } + + // Assume fgate.qubits (gate.qubits) are sorted. + fused_gates.push_back({fgate.parent->kind, fgate.parent->time, + std::move(fgate.qubits), fgate.parent, + std::move(fgate.gates), {}}); + + if (fgate.visited != kMeaCnt) { + ++stat.num_fused_gates; } } + + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + } + } + } + + if (fuse_matrix) { + for (auto& fgate : fused_gates) { + if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) { + CalculateFusedMatrix(fgate); + } } } @@ -448,13 +482,13 @@ class MultiQubitGateFuser final : public Fuser { // // max_fused_size = 6: _-_-_ static void FuseGateSequences(unsigned max_fused_size, - unsigned num_qubits, Scratch& scratch, + unsigned max_qubit1, Scratch& scratch, std::vector& gates_seq, Stat& stat, std::vector& fused_gates) { unsigned prev_time = 0; std::vector orphaned_gates; - orphaned_gates.reserve(num_qubits); + orphaned_gates.reserve(max_qubit1); for (auto& fgate : gates_seq) { if (prev_time != fgate.parent->time) { @@ -471,14 +505,14 @@ class MultiQubitGateFuser final : public Fuser { if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size || 
fgate.parent->unfusible) { if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; + ++stat.num_fused_gates; } fgate.visited = kFinal; fused_gates.push_back({fgate.parent->kind, fgate.parent->time, std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates)}); + std::move(fgate.gates), {}}); continue; } @@ -503,7 +537,7 @@ class MultiQubitGateFuser final : public Fuser { fused_gates.push_back({fgate->parent->kind, fgate->parent->time, std::move(fgate->qubits), fgate->parent, - std::move(fgate->gates)}); + std::move(fgate->gates), {}}); ++stat.num_fused_gates; } @@ -564,7 +598,7 @@ class MultiQubitGateFuser final : public Fuser { fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time, std::move(ogate1->qubits), ogate1->parent, - std::move(ogate1->gates)}); + std::move(ogate1->gates), {}}); ++stat.num_fused_gates; } @@ -921,7 +955,9 @@ class MultiQubitGateFuser final : public Fuser { if (ln != nullptr && rn != nullptr) { return R()(ln->val->parent->time, rn->val->parent->time); } else { - return ln != nullptr || rn == nullptr; + // nullptrs are larger than everything else and + // equivalent among each other. 
+ return ln != nullptr; } }); @@ -963,7 +999,7 @@ class MultiQubitGateFuser final : public Fuser { static void PrintStat(unsigned verbosity, const Stat& stat, const std::vector& fused_gates) { - if (verbosity == 0) return; + if (verbosity < 3) return; if (stat.num_controlled_gates > 0) { IO::messagef("%lu controlled gates\n", stat.num_controlled_gates); @@ -992,10 +1028,10 @@ class MultiQubitGateFuser final : public Fuser { IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates); - if (verbosity == 1) return; + if (verbosity < 5) return; IO::messagef("fused gate qubits:\n"); - for (const auto g : fused_gates) { + for (const auto& g : fused_gates) { IO::messagef("%6u ", g.parent->time); if (g.parent->kind == gate::kMeasurement) { IO::messagef("m"); @@ -1016,17 +1052,36 @@ class MultiQubitGateFuser final : public Fuser { } } - template - static bool GateIsOutOfOrder(unsigned time, - const std::vector& qubits, - const GatesLat& gates_lat) { - for (unsigned q : qubits) { - if (gates_lat[q] != nullptr && time <= gates_lat[q]->val->parent->time) { - return true; + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; } } - return false; + return true; } }; diff --git 
a/lib/gate_appl.h b/lib/gate_appl.h index 59b60082..8601e6f2 100644 --- a/lib/gate_appl.h +++ b/lib/gate_appl.h @@ -136,13 +136,12 @@ template inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate, typename Simulator::State& state) { if (gate.kind != gate::kMeasurement) { - using fp_type = typename Simulator::fp_type; - auto matrix = CalculateFusedMatrix(gate); if (gate.parent->controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, matrix.data(), state); + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); } else { simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, - gate.parent->cmask, matrix.data(), state); + gate.parent->cmask, gate.matrix.data(), + state); } } } @@ -160,9 +159,9 @@ template inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate, typename Simulator::State& state) { if (gate.kind != gate::kMeasurement) { - using fp_type = typename Simulator::fp_type; - auto matrix = CalculateFusedMatrix(gate); + auto matrix = gate.matrix; MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); + if (gate.parent->controlled_by.size() == 0) { simulator.ApplyGate(gate.qubits, matrix.data(), state); } else { diff --git a/lib/gates_qsim.h b/lib/gates_qsim.h index 7690c2b6..9b64cf98 100644 --- a/lib/gates_qsim.h +++ b/lib/gates_qsim.h @@ -46,6 +46,8 @@ enum GateKind { kGateIS, // iSwap kGateFS, // fSim kGateCP, // control phase + kGateMatrix1, // One-qubit matrix gate. + kGateMatrix2, // Two-qubit matrix gate. kDecomp = gate::kDecomp, kMeasurement = gate::kMeasurement, }; @@ -317,6 +319,24 @@ struct GateS { } }; +/** + * A one-qubit gate defined entirely by its matrix. 
+ */ +template +struct GateMatrix1 { + static constexpr GateKind kind = kGateMatrix1; + static constexpr char name[] = "mat1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, + const Matrix& m) { + auto m2 = m; + return + CreateGate, GateMatrix1>(time, {q0}, std::move(m2)); + } +}; + // Two-qubit gates: /** @@ -566,6 +586,29 @@ struct GateCP { } }; +/** + * A two-qubit gate defined entirely by its matrix. + */ +template +struct GateMatrix2 { + static constexpr GateKind kind = kGateMatrix2; + static constexpr char name[] = "mat2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + template > + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, M&& m) { + return CreateGate, GateMatrix2>(time, {q1, q0}, + std::forward(m)); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + // Not implemented. + return schmidt_decomp_type{}; + } +}; + template inline schmidt_decomp_type GetSchmidtDecomp( GateKind kind, const std::vector& params) { diff --git a/lib/hybrid.h b/lib/hybrid.h index 0ce98b63..d0189efc 100644 --- a/lib/hybrid.h +++ b/lib/hybrid.h @@ -40,6 +40,7 @@ struct HybridSimulator final { // Note that one can use "struct GateHybrid : public Gate {" in C++17. 
struct GateHybrid { using GateKind = HybridSimulator::GateKind; + using fp_type = HybridSimulator::fp_type; GateKind kind; unsigned time; @@ -554,7 +555,13 @@ struct HybridSimulator final { const Simulator& simulator, typename Simulator::State& state) { for (std::size_t i = i0; i < i1; ++i) { - ApplyFusedGate(simulator, gates[i], state); + if (gates[i].matrix.size() > 0) { + ApplyFusedGate(simulator, gates[i], state); + } else { + auto gate = gates[i]; + CalculateFusedMatrix(gate); + ApplyFusedGate(simulator, gate, state); + } } } diff --git a/lib/matrix.h b/lib/matrix.h index f9725eca..e126a02d 100644 --- a/lib/matrix.h +++ b/lib/matrix.h @@ -92,6 +92,39 @@ inline void MatrixMultiply( } } +/** + * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2. + * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @m1 Matrix m1. + * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. + */ +template +inline void MatrixDaggerMultiply( + unsigned q, const Matrix& m1, Matrix& m2) { + Matrix mt = m2; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n; ++k) { + fp_type2 r1 = m1[2 * (n * k + i)]; + fp_type2 i1 = m1[2 * (n * k + i) + 1]; + fp_type2 r2 = mt[2 * (n * k + j)]; + fp_type2 i2 = mt[2 * (n * k + j) + 1]; + + re += r1 * r2 + i1 * i2; + im += r1 * i2 - i1 * r2; + } + + m2[2 * (n * i + j)] = re; + m2[2 * (n * i + j) + 1] = im; + } + } +} + /** * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed * the size of m2. diff --git a/lib/mps_simulator.h b/lib/mps_simulator.h index ae053690..8fbcbae1 100644 --- a/lib/mps_simulator.h +++ b/lib/mps_simulator.h @@ -35,11 +35,12 @@ namespace mps { /** * Truncated Matrix Product State (MPS) circuit simulator w/ vectorization. 
*/ -template +template class MPSSimulator final { public: - using MPSStateSpace_ = MPSStateSpace; + using MPSStateSpace_ = MPSStateSpace; using State = typename MPSStateSpace_::MPS; + using fp_type = typename MPSStateSpace_::fp_type; using Complex = std::complex; using Matrix = @@ -219,7 +220,7 @@ class MPSSimulator final { block_0.fill(Complex(0, 0)); const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols(); block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() = - svd_u(Eigen::all, Eigen::seq(0, keep_cols - 1)); + svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1)); // Place row product of S V into scratch to truncate and then B1. MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim); @@ -232,7 +233,8 @@ class MPSSimulator final { for (unsigned i = 0; i < keep_rows; ++i) { svd_v.row(i) *= s_vector(i); } - block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = svd_v(row_seq, Eigen::all); + block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = + svd_v(row_seq, Eigen::indexing::all); } For for_; diff --git a/lib/mps_statespace.h b/lib/mps_statespace.h index 888d4d58..9b3acf31 100644 --- a/lib/mps_statespace.h +++ b/lib/mps_statespace.h @@ -26,8 +26,10 @@ #include #include #include +#include #include "../eigen/Eigen/Dense" +#include "../eigen/unsupported/Eigen/CXX11/Tensor" namespace qsim { @@ -51,10 +53,11 @@ inline void free(void* ptr) { * Class containing context and routines for fixed bond dimension * truncated Matrix Product State (MPS) simulation. */ -template +template class MPSStateSpace { private: public: + using fp_type = FP; using Pointer = std::unique_ptr; using Complex = std::complex; @@ -179,8 +182,8 @@ class MPSStateSpace { fp_type* state2_raw = state2.get(); // Contract leftmost blocks together, store result in state1 scratch. 
- ConstMatrixMap top((Complex*) state2_raw, 2, bond_dim); - ConstMatrixMap bot((Complex*) state1_raw, 2, bond_dim); + ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim); MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim, bond_dim); MatrixMap partial_contract2( @@ -231,6 +234,326 @@ class MPSStateSpace { return partial_contract(0, 0); } + // Compute the 2x2 1-RDM of state on index. Result written to rdm. + // Requires: scratch and rdm to be allocated. + static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index, + fp_type* rdm) { + const auto num_qubits = state.num_qubits(); + const auto bond_dim = state.bond_dim(); + const auto end = Size(state); + const bool last_index = (index == num_qubits - 1); + const auto right_dim = (last_index ? 1 : bond_dim); + auto offset = 0; + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim; + fp_type* scratch_raw_workspace = + scratch_raw + end + 2 * bond_dim * bond_dim; + + Copy(state, scratch); + + // Contract leftmost blocks together, store result in state scratch. + ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim); + MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim); + MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim, + 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index > 0) { + partial_contract.noalias() = top.adjoint() * bot; + } + + // Contract all internal blocks together. + for (unsigned i = 1; i < index; ++i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. 
+ new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot; + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.adjoint() * partial_contract2; + } + + // The [bond_dim, bond_dim] block in state_raw now contains the contraction + // up to, but not including index. + // Contract rightmost blocks. + offset = GetBlockOffset(state, num_qubits - 1); + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2); + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index < num_qubits - 1) { + partial_contract.noalias() = top * bot.adjoint(); + } + + for (unsigned i = num_qubits - 2; i > index; --i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. 
+ new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + // [bd, bd] = [bd, 2bd] @ [bd, 2bd] + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // The [bond_dim, bond_dim] block in scratch_raw now contains the + // contraction down from the end, but not including the index. Begin final + // contraction steps. + + // Get leftmost [bd, bd] contraction and contract with top. + + offset = GetBlockOffset(state, index); + new (&partial_contract) + MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim); + new (&top) + ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + // copy the bottom contraction scratch_raw to state_raw to save space. + memcpy(state_raw + end, scratch_raw + end, + bond_dim * bond_dim * 2 * sizeof(fp_type)); + + // Contract top again for correct shape. + fp_type* contract3_target = (last_index ? rdm : scratch_raw); + MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim, + 2 * right_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // If we are contracting the last index, all the needed transforms are done. + if (last_index) { + return; + } + + // Conduct final tensor contraction operations. Cannot be easily compiled to + // matmul. + const Eigen::TensorMap> + t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(state_raw + end), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + } + + // Draw a single bitstring sample from state using scratch and scratch2 + // as working space. 
+ static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2, + std::mt19937* random_gen, std::vector* sample) { + // TODO: carefully profile with perf and optimize temp storage + // locations for cache friendliness. + const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + const auto end = Size(state); + const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1); + std::default_random_engine generator; + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* scratch2_raw = scratch2.get(); + fp_type rdm[8]; + + sample->reserve(num_qubits); + Copy(state, scratch); + Copy(state, scratch2); + + // Store prefix contractions in scratch2. + auto offset = GetBlockOffset(state, num_qubits - 1); + ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2); + ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2); + MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim, + bond_dim); + MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim, + 2 * bond_dim); + partial_contract.noalias() = top * bot.adjoint(); + + for (unsigned i = num_qubits - 2; i > 0; --i) { + offset = GetBlockOffset(state, i); + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + + // merge into partial_contract -> scracth2_raw. 
+ new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // Compute RDM-0 and draw first sample. + offset = GetBlockOffset(state, 1); + new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim); + new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim); + new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim); + + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top * partial_contract2.adjoint(); + auto p0 = rdm[0] / (rdm[0] + rdm[6]); + std::bernoulli_distribution distribution(1 - p0); + auto bit_val = distribution(*random_gen); + sample->push_back(bit_val); + + // collapse state. + new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim); + partial_contract.row(!bit_val).setZero(); + + // Prepare left contraction frontier. + new (&partial_contract2) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + partial_contract2.noalias() = + partial_contract.transpose() * partial_contract.conjugate(); + + // Compute RDM-i and draw internal tensor samples. + for (unsigned i = 1; i < num_qubits - 1; i++) { + // Get leftmost [bd, bd] contraction and contract with top. + offset = GetBlockOffset(state, i); + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + + // Contract top again for correct shape. 
+ MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim, + 2 * bond_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // Conduct final tensor contraction operations. Cannot be easily compiled + // to matmul. Perf reports shows only ~6% of runtime spent here on large + // systems. + offset = GetBlockOffset(state, i + 1); + const Eigen::TensorMap> + t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + + // Sample bit and collapse state. + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + + sample->push_back(bit_val); + offset = GetBlockOffset(state, i); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim); + for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) { + partial_contract.row(j).setZero(); + } + + // Update left frontier. + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot.conjugate(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.transpose() * partial_contract2; + } + + // Compute RDM-(n-1) and sample. 
+ offset = GetBlockOffset(state, num_qubits - 1); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2); + + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + partial_contract2.noalias() = partial_contract * top.conjugate(); + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top.transpose() * partial_contract2; + + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + sample->push_back(bit_val); + } + + // Draw num_samples bitstring samples from state and store the result + // bit vectors in results. Uses scratch and scratch2 as workspace. + static void Sample(MPS& state, MPS& scratch, MPS& scratch2, + unsigned num_samples, unsigned seed, + std::vector>* results) { + std::mt19937 rand_source(seed); + results->reserve(num_samples); + for (unsigned i = 0; i < num_samples; i++) { + SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]); + } + } + // Testing only. Convert the MPS to a wavefunction under "normal" ordering. // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1 // memory. diff --git a/lib/qtrajectory.h b/lib/qtrajectory.h index bba4708c..1da6692f 100644 --- a/lib/qtrajectory.h +++ b/lib/qtrajectory.h @@ -16,6 +16,7 @@ #define QTRAJECTORY_H_ #include +#include #include #include #include @@ -55,6 +56,37 @@ class QuantumTrajectorySimulator { * If true, normalize the state vector before performing measurements. */ bool normalize_before_mea_gates = true; + /** + * If false, do not apply deferred operators after the main loop for + * the "primary" noise trajectory, that is the trajectory in which + * the primary (the first operators in their respective channels) Kraus + * operators are sampled for each channel and there are no measurements + * in the computational basis. 
This can be used to speed up simulations + * of circuits with weak noise and without measurements by reusing + * the primary trajectory results. There is an additional condition for + * RunBatch. In this case, the deferred operators after the main loop are + * still applied for the first occurence of the primary trajectory. + * The primary Kraus operators should have the highest sampling + * probabilities to achieve the highest speedup. + * + * It is the client's responsibility to collect the primary trajectory + * results and to reuse them. + */ + bool apply_last_deferred_ops = true; + }; + + /** + * Struct with statistics to populate by RunBatch and RunOnce methods. + */ + struct Stat { + /** + * Indices of sampled Kraus operator indices and/or measured bitstrings. + */ + std::vector samples; + /** + * True if the "primary" noise trajectory is sampled, false otherwise. + */ + bool primary; }; /** @@ -70,8 +102,7 @@ class QuantumTrajectorySimulator { * computing expectation values, etc). This function should have three * required parameters [repetition ID (uint64_t), final state vector * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const std::vector&)] and any number of - * optional parameters. + * measured bitstrings (const Stat&)] and any number of optional parameters. * @param args Optional arguments for the 'measure' function. * @return True if the simulation completed successfully; false otherwise. */ @@ -100,8 +131,7 @@ class QuantumTrajectorySimulator { * computing expectation values, etc). This function should have three * required parameters [repetition ID (uint64_t), final state vector * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const std::vector&)] and any number of - * optional parameters. + * measured bitstrings (const Stat&)] and any number of optional parameters. * @param args Optional arguments for the 'measure' function. 
* @return True if the simulation completed successfully; false otherwise. */ @@ -116,20 +146,27 @@ class QuantumTrajectorySimulator { gates.reserve(4 * std::size_t(cend - cbeg)); State state = state_space.Null(); - State scratch = state_space.Null(); - std::vector stat; + Stat stat; + bool had_primary_realization = false; for (uint64_t r = r0; r < r1; ++r) { if (!state_space.IsNull(state)) { state_space.SetStateZero(state); } - if (!RunIteration(param, num_qubits, cbeg, cend, r, - state_space, simulator, gates, scratch, state, stat)) { + bool apply_last_deferred_ops = + param.apply_last_deferred_ops || !had_primary_realization; + + if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend, + r, state_space, simulator, gates, state, stat)) { return false; } + if (stat.primary && !had_primary_realization) { + had_primary_realization = true; + } + measure(r, state, stat, args...); } @@ -144,7 +181,6 @@ class QuantumTrajectorySimulator { * @param state_space StateSpace object required to manipulate state vector. * @param simulator Simulator object. Provides specific implementations for * applying gates. - * @param scratch A temporary state vector. Used for samping Kraus operators. * @param state The state of the system, to be updated by this method. * @param stat Statistics of sampled Kraus operator indices and/or measured * bitstrings, to be populated by this method. @@ -153,11 +189,10 @@ class QuantumTrajectorySimulator { static bool RunOnce(const Parameter& param, const NoisyCircuit& circuit, uint64_t r, const StateSpace& state_space, const Simulator& simulator, - State& scratch, State& state, - std::vector& stat) { + State& state, Stat& stat) { return RunOnce(param, circuit.num_qubits, circuit.channels.begin(), circuit.channels.end(), r, state_space, simulator, - scratch, state, stat); + state, stat); } /** @@ -170,7 +205,6 @@ class QuantumTrajectorySimulator { * @param state_space StateSpace object required to manipulate state vector. 
* @param simulator Simulator object. Provides specific implementations for * applying gates. - * @param scratch A temporary state vector. Used for samping Kraus operators. * @param state The state of the system, to be updated by this method. * @param stat Statistics of sampled Kraus operator indices and/or measured * bitstrings, to be populated by this method. @@ -180,13 +214,12 @@ class QuantumTrajectorySimulator { ncircuit_iterator cbeg, ncircuit_iterator cend, uint64_t r, const StateSpace& state_space, - const Simulator& simulator, State& scratch, State& state, - std::vector& stat) { + const Simulator& simulator, State& state, Stat& stat) { std::vector gates; gates.reserve(4 * std::size_t(cend - cbeg)); - if (!RunIteration(param, num_qubits, cbeg, cend, r, - state_space, simulator, gates, scratch, state, stat)) { + if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg, + cend, r, state_space, simulator, gates, state, stat)) { return false; } @@ -194,16 +227,17 @@ class QuantumTrajectorySimulator { } private: - static bool RunIteration(const Parameter& param, unsigned num_qubits, + static bool RunIteration(const Parameter& param, + bool apply_last_deferred_ops, unsigned num_qubits, ncircuit_iterator cbeg, ncircuit_iterator cend, uint64_t rep, const StateSpace& state_space, const Simulator& simulator, - std::vector& gates, State& scratch, - State& state, std::vector& stat) { + std::vector& gates, + State& state, Stat& stat) { if (param.collect_kop_stat || param.collect_mea_stat) { - stat.reserve(std::size_t(cend - cbeg)); - stat.resize(0); + stat.samples.reserve(std::size_t(cend - cbeg)); + stat.samples.resize(0); } if (state_space.IsNull(state)) { @@ -216,12 +250,12 @@ class QuantumTrajectorySimulator { } gates.resize(0); - stat.resize(0); RGen rgen(rep); std::uniform_real_distribution distr(0.0, 1.0); bool unitary = true; + stat.primary = true; for (auto it = cbeg; it != cend; ++it) { const auto& channel = *it; @@ -247,6 +281,8 @@ class 
QuantumTrajectorySimulator { CollectStat(param.collect_mea_stat, mresult.bits, stat); + stat.primary = false; + continue; } @@ -279,14 +315,8 @@ class QuantumTrajectorySimulator { NormalizeState(!unitary, state_space, unitary, state); - if (state_space.IsNull(scratch)) { - scratch = CreateState(num_qubits, state_space); - if (state_space.IsNull(scratch)) { - return false; - } - } - - state_space.Copy(state, scratch); + double max_prob = 0; + std::size_t max_prob_index = 0; // Perform sampling of Kraus operators using norms of updated states. for (std::size_t i = 0; i < channel.size(); ++i) { @@ -294,43 +324,39 @@ class QuantumTrajectorySimulator { if (kop.unitary) continue; - // Apply the Kraus operator. - if (kop.ops.size() == 1) { - ApplyGate(simulator, kop.ops[0], state); - } else { - DeferOps(kop.ops, gates); + double prob = std::real( + simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state)); - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } + if (prob > max_prob) { + max_prob = prob; + max_prob_index = i; } - double n2 = state_space.Norm(state); - - cp += n2 - kop.prob; + cp += prob - kop.prob; if (r < cp || i == channel.size() - 1) { // Sample ith Kraus operator if r < cp - // Sample the first Kraus operator if r is greater than the sum of - // all probablities due to round-off errors. - uint64_t k = r < cp ? i : 0; + // Sample the highest probability Kraus operator if r is greater + // than the sum of all probablities due to round-off errors. + uint64_t k = r < cp ? 
i : max_prob_index; + DeferOps(channel[k].ops, gates); CollectStat(param.collect_kop_stat, k, stat); unitary = false; break; } - - state_space.Copy(scratch, state); } } - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } + if (apply_last_deferred_ops || !stat.primary) { + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } - NormalizeState(!unitary, state_space, unitary, state); + NormalizeState(!unitary, state_space, unitary, state); + } return true; } @@ -384,10 +410,13 @@ class QuantumTrajectorySimulator { } } - static void CollectStat(bool collect_stat, uint64_t i, - std::vector& stat) { + static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) { if (collect_stat) { - stat.push_back(i); + stat.samples.push_back(i); + } + + if (i != 0) { + stat.primary = false; } } diff --git a/lib/run_qsim.h b/lib/run_qsim.h index bbc401b6..b0aad9f3 100644 --- a/lib/run_qsim.h +++ b/lib/run_qsim.h @@ -79,7 +79,7 @@ struct QSimRunner final { double t0 = 0.0; double t1 = 0.0; - if (param.verbosity > 0) { + if (param.verbosity > 1) { t0 = GetTime(); } @@ -96,17 +96,33 @@ struct QSimRunner final { state_space.SetStateZero(state); Simulator simulator = factory.CreateSimulator(); + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, circuit.gates, times_to_measure_at); + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { return false; } + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + unsigned cur_time_index = 0; // Apply fused gates. 
for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 1) { + if (param.verbosity > 3) { t1 = GetTime(); } @@ -116,7 +132,7 @@ struct QSimRunner final { return false; } - if (param.verbosity > 1) { + if (param.verbosity > 3) { double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } @@ -132,7 +148,7 @@ struct QSimRunner final { if (param.verbosity > 0) { double t2 = GetTime(); - IO::messagef("time elapsed %g seconds.\n", t2 - t0); + IO::messagef("time is %g seconds.\n", t2 - t0); } return true; @@ -159,7 +175,7 @@ struct QSimRunner final { double t0 = 0.0; double t1 = 0.0; - if (param.verbosity > 0) { + if (param.verbosity > 1) { t0 = GetTime(); } @@ -168,16 +184,33 @@ struct QSimRunner final { StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, circuit.gates); + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { return false; } + measure_results.reserve(fused_gates.size()); + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + // Apply fused gates. 
for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 1) { + if (param.verbosity > 3) { t1 = GetTime(); } @@ -187,7 +220,7 @@ struct QSimRunner final { return false; } - if (param.verbosity > 1) { + if (param.verbosity > 3) { double t2 = GetTime(); IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); } @@ -195,7 +228,7 @@ struct QSimRunner final { if (param.verbosity > 0) { double t2 = GetTime(); - IO::messagef("time elapsed %g seconds.\n", t2 - t0); + IO::messagef("simu time is %g seconds.\n", t2 - t0); } return true; diff --git a/lib/simulator.h b/lib/simulator.h new file mode 100644 index 00000000..d5af3c2a --- /dev/null +++ b/lib/simulator.h @@ -0,0 +1,511 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_H_ +#define SIMULATOR_H_ + +#include + +#include "bits.h" + +namespace qsim { + +/** + * Base class for simulator classes. + */ +class SimulatorBase { + protected: + // The follwoing template parameters are used for functions below. + // H - the number of high (target) qubits. + // L - the number of low (target) qubits. + // R - SIMD register width in floats. + + // Fills the table of masks (ms) that is used to calculate base state indices + // and the table of offset indices (xss) that is used to access the state + // vector entries in matrix-vector multiplication functions. 
This function is + // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 + // version). + template + static void FillIndices(unsigned num_qubits, const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + constexpr unsigned hsize = 1 << H; + + uint64_t xs[H]; + + xs[0] = uint64_t{1} << (qs[L] + 1); + ms[0] = (uint64_t{1} << qs[L]) - 1; + for (unsigned i = 1; i < H; ++i) { + xs[i] = uint64_t{1} << (qs[L + i] + 1); + ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1); + } + ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1); + + for (unsigned i = 0; i < hsize; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < H; ++k) { + a += xs[k] * ((i >> k) & 1); + } + xss[i] = a; + } + } + + // Fills gate matrix entries for gates with low qubits. + template + static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize); + + w[s + 0] = matrix[p]; + w[s + rsize] = matrix[p + 1]; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with high target qubits + // and low control qubits. + template + static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl, + const fp_type* matrix, fp_type* w) { + constexpr unsigned hsize = 1 << H; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < hsize; ++j) { + unsigned p = hsize * i + j; + fp_type v = i == j ? 1 : 0; + + for (unsigned k = 0; k < rsize; ++k) { + w[s] = cvalsl == (k & cmaskl) ? 
matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with low target qubits + // and low control qubits. + template + static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl, + unsigned qmaskl, const fp_type* matrix, + fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = i * lsize * gsize + lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + gsize * l + (j + l) % lsize; + + fp_type v = p / gsize == p % gsize ? 1 : 0; + + w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + +/* + The GetMasks* functions below provide various masks and related values. + GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are + used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7, + GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h + (no BMI2 version) and in simulator_sse.h. + + imaskh - inverted mask of high qubits (high control and target qubits). + qmaskh - mask of high qubits (high target qubits). + cvalsh - control bit values of high control qubits placed in correct + positions. + cvalsl - control bit values of low control qubits placed in correct positions. + cmaskh - mask of high control qubits. + cmaskl - mask of low control qubits. + qmaskl - mask of low qubits (low target qubits). + cl - the number of low control qubits. + + Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1, + GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6. 
+*/ + + struct Masks1 { + uint64_t imaskh; + uint64_t qmaskh; + }; + + template + static Masks1 GetMasks1(const std::vector& qs) { + uint64_t qmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh}; + } + + struct Masks2 { + uint64_t imaskh; + uint64_t qmaskh; + unsigned qmaskl; + }; + + template + static Masks2 GetMasks2(const std::vector& qs) { + uint64_t qmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl}; + } + + struct Masks3 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + }; + + template + static Masks3 GetMasks3(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh}; + } + + struct Masks4 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned cl; + }; + + template + static Masks4 GetMasks4(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = 
~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl}; + } + + struct Masks5 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + unsigned qmaskl; + }; + + template + static Masks5 GetMasks5(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl}; + } + + struct Masks6 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + unsigned cl; + }; + + template + static Masks6 GetMasks6(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl}; + } + + struct Masks7 { + uint64_t cvalsh; + uint64_t cmaskh; + }; + + static Masks7 GetMasks7(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + + for (auto q : cqs) { + cmaskh |= 
uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh}; + } + + struct Masks8 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + }; + + template + static Masks8 GetMasks8(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl}; + } + + struct Masks9 { + uint64_t cvalsh; + uint64_t cmaskh; + unsigned qmaskl; + }; + + template + static Masks9 GetMasks9(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh, qmaskl}; + } + + struct Masks10 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + }; + + template + static Masks10 GetMasks10(unsigned num_qubits, + const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl}; + } + + struct Masks11 { + unsigned 
qmaskl; + }; + + template + static Masks11 GetMasks11(const std::vector& qs) { + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + return {qmaskl}; + } + + template + static unsigned MaskedAdd( + unsigned a, unsigned b, unsigned mask, unsigned lsize) { + unsigned c = bits::CompressBits(a, R, mask); + return bits::ExpandBits((c + b) % lsize, R, mask); + } +}; + +template <> +inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +} // namespace qsim + +#endif // SIMULATOR_H_ diff --git a/lib/simulator_avx.h b/lib/simulator_avx.h index f0dbaa5e..351e0832 100644 --- a/lib/simulator_avx.h +++ b/lib/simulator_avx.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "statespace_avx.h" namespace qsim { @@ -30,7 +31,7 @@ namespace qsim { * Quantum circuit simulator with AVX vectorization. 
*/ template -class SimulatorAVX final { +class SimulatorAVX final : public SimulatorBase { public: using StateSpace = StateSpaceAVX; using State = typename StateSpace::State; @@ -52,62 +53,62 @@ class SimulatorAVX final { switch (qs.size()) { case 1: if (qs[0] > 2) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 2) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 2) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } else { - ApplyGate3LLL(qs, matrix, state); + ApplyGateL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 2) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } else { - ApplyGate4HLLL(qs, matrix, state); + ApplyGateL<1, 3>(qs, matrix, state); } break; case 5: if (qs[0] > 2) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } else { - ApplyGate5HHLLL(qs, matrix, state); + ApplyGateL<2, 3>(qs, matrix, state); } break; case 6: if (qs[0] > 2) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else 
if (qs[1] > 2) { - ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } else { - ApplyGate6HHHLLL(qs, matrix, state); + ApplyGateL<3, 3>(qs, matrix, state); } break; default: @@ -120,13 +121,16 @@ class SimulatorAVX final { * Applies a controlled gate using AVX instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
+ if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -136,90 +140,90 @@ class SimulatorAVX final { case 1: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 3: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 
1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 2) { if (cqs[0] > 2) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -244,62 +248,62 @@ class 
SimulatorAVX final { switch (qs.size()) { case 1: if (qs[0] > 2) { - return ExpectationValue1H(qs, matrix, state); + return ExpectationValueH<1>(qs, matrix, state); } else { - return ExpectationValue1L(qs, matrix, state); + return ExpectationValueL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 2) { - return ExpectationValue2HH(qs, matrix, state); + return ExpectationValueH<2>(qs, matrix, state); } else if (qs[1] > 2) { - return ExpectationValue2HL(qs, matrix, state); + return ExpectationValueL<1, 1>(qs, matrix, state); } else { - return ExpectationValue2LL(qs, matrix, state); + return ExpectationValueL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 2) { - return ExpectationValue3HHH(qs, matrix, state); + return ExpectationValueH<3>(qs, matrix, state); } else if (qs[1] > 2) { - return ExpectationValue3HHL(qs, matrix, state); + return ExpectationValueL<2, 1>(qs, matrix, state); } else if (qs[2] > 2) { - return ExpectationValue3HLL(qs, matrix, state); + return ExpectationValueL<1, 2>(qs, matrix, state); } else { - return ExpectationValue3LLL(qs, matrix, state); + return ExpectationValueL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 2) { - return ExpectationValue4HHHH(qs, matrix, state); + return ExpectationValueH<4>(qs, matrix, state); } else if (qs[1] > 2) { - return ExpectationValue4HHHL(qs, matrix, state); + return ExpectationValueL<3, 1>(qs, matrix, state); } else if (qs[2] > 2) { - return ExpectationValue4HHLL(qs, matrix, state); + return ExpectationValueL<2, 2>(qs, matrix, state); } else { - return ExpectationValue4HLLL(qs, matrix, state); + return ExpectationValueL<1, 3>(qs, matrix, state); } break; case 5: if (qs[0] > 2) { - return ExpectationValue5HHHHH(qs, matrix, state); + return ExpectationValueH<5>(qs, matrix, state); } else if (qs[1] > 2) { - return ExpectationValue5HHHHL(qs, matrix, state); + return ExpectationValueL<4, 1>(qs, matrix, state); } else if (qs[2] > 2) { - return ExpectationValue5HHHLL(qs, matrix, state); + return 
ExpectationValueL<3, 2>(qs, matrix, state); } else { - return ExpectationValue5HHLLL(qs, matrix, state); + return ExpectationValueL<2, 3>(qs, matrix, state); } break; case 6: if (qs[0] > 2) { - return ExpectationValue6HHHHHH(qs, matrix, state); + return ExpectationValueH<6>(qs, matrix, state); } else if (qs[1] > 2) { - return ExpectationValue6HHHHHL(qs, matrix, state); + return ExpectationValueL<5, 1>(qs, matrix, state); } else if (qs[2] > 2) { - return ExpectationValue6HHHHLL(qs, matrix, state); + return ExpectationValueL<4, 2>(qs, matrix, state); } else { - return ExpectationValue6HHHLLL(qs, matrix, state); + return ExpectationValueL<3, 3>(qs, matrix, state); } break; default: @@ -318,44 +322,31 @@ class SimulatorAVX final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[2], is[2]; + __m256 rs[hsize], is[hsize]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); + auto p0 = rstate + _pdep_u64(i, imaskh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 
+ p + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -365,89 +356,64 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 2; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks1(qs); - unsigned k = 4; + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); } - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m256 rn, in; - __m256 rs[2], is[2]; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); - auto p0 = rstate + 16 * i; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * 
l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -455,71 +421,60 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, idx, rstate); + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[4], is[4]; + __m256 rs[hsize], is[hsize]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -529,110 +484,124 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = 
_mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - unsigned k = 5; + unsigned k = 3 + H + cqs.size(); unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - unsigned p[8]; - __m256i idx[1]; + __m256 rn, in; + __m256 rs[hsize], is[hsize]; - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - unsigned qmask = (1 << qs[0]); + for (unsigned k = 0; k < hsize; ++k) { + 
uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } + uint64_t j = 0; - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; - unsigned l = 2 * (4 * i + m); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; + j += 2; } - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } - } + }; + + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m256 rn, in; - __m256 rs[4], is[4]; + __m256 rs[gsize], is[gsize]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -640,87 +609,149 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], 
w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, idx, rstate); + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } } - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[8]; - __m256i idx[3]; + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - auto s = StateSpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + auto p0 = rstate + _pdep_u64(i, imaskh); - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } + double re = 0; + double im = 0; + uint64_t j = 0; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); - unsigned l = 2 * (4 * i + m); + j += 2; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in 
= _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; + j += 2; } + + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); } - } + return std::complex{re, im}; + }; + + auto m = GetMasks1(qs); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m256 rn, in; - __m256 rs[4], is[4]; + __m256 rs[gsize], is[gsize]; - auto p0 = rstate + 16 * i; + auto p0 = rstate + _pdep_u64(i, imaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } + double re = 0; + double im = 0; uint64_t j = 0; - for 
(unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -728,72 +759,73 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + unsigned m = lsize * k; + + __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); } + + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, idx, rstate); + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); } - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } +#else // __BMI2__ + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[8], is[8]; + __m256 rs[hsize], is[hsize]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - auto p0 = rstate + 2 * k; + auto p0 = rstate + 2 * ii; - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -803,114 +835,71 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; 
n < 8; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 6; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, ms, xss, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - 
for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - unsigned l = 2 * (8 * i + m); + __m256 rn, in; + __m256 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + i *= 8; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); + auto p0 = rstate + 2 * ii; - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn 
= _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -918,108 +907,140 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, idx, rstate); + for_.Run(size, f, w, ms, xss, idx, state.get()); } - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - unsigned p[8]; - __m256i idx[3]; + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; - auto s = StateSpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + if ((ii & cmaskh) != cvalsh) return; - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } + uint64_t j = 0; - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } + for (unsigned k = 0; k < 
hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); - unsigned l = 2 * (8 * i + m); + j += 2; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; + j += 2; } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } - } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 rn, in; - __m256 rs[8], is[8]; + __m256 rs[hsize], is[hsize]; + + i *= 8; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - auto p0 = rstate + 2 * k; + if ((ii & cmaskh) != cvalsh) return; - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); + 
auto p0 = rstate + 2 * ii; - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -1027,87 +1048,76 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, idx, rstate); + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); } - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[8]; - __m256i idx[7]; + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + __m256 rn, in; + __m256 rs[gsize], is[gsize]; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + i *= 8; - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } + if ((ii & cmaskh) != cvalsh) return; - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; + auto p0 = rstate + 2 * ii; - auto p0 = rstate + 16 * i; + for (unsigned k = 0; k < hsize; ++k) { + 
unsigned k2 = lsize * k; - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -1115,72 +1125,79 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, idx, rstate); - } + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); } + } + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[16], is[16]; + __m256 rs[hsize], is[hsize]; + + i *= 8; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - auto p0 = rstate + 2 * k; + auto p0 = rstate + 2 * ii; - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + for 
(unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } + double re = 0; + double im = 0; uint64_t j = 0; - for (unsigned l = 0; l < 16; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -1190,6379 +1207,81 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 16; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); } + + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 7; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); } - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = 
_mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(9); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], 
p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j 
- 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = 
_mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(12); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for 
(unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(11); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] 
+ 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; 
i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]) - | (512 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(14); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] 
= _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(13); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - 
for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(12); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = 
_mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[2], is[2]; - - uint64_t k = 
bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= 
uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = StateSpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for 
(unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + 
m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); 
- in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = StateSpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 
1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - 
uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - 
uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) 
% 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, 
rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = StateSpace::Create(9); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 
1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, 
- unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 
8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 
rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - 
} - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3LLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * 
k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], 
iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = StateSpace::Create(11); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 
1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* 
xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] 
/ 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(9); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* 
ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(9); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) 
/ 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - 
__m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[2], is[2]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue1L(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - auto p0 = rstate + 16 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 
1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue2HH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[4], is[4]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = 
_mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue2HL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { 
- wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue2LL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - auto p0 = rstate + 16 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = 
_mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue3HHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[8], is[8]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for 
(unsigned n = 1; n < 8; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue3HHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 
* k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue3HLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * 
l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue3LLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for 
(unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - auto p0 = rstate + 16 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue4HHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += 
detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue4HHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; 
- __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue4HHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(9); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + 
xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue4HLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(8); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], 
idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & 
ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, _mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue5HHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(12); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - 
rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(11); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] 
= _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = StateSpace::Create(10); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t k = (8 * i & ms[0]) | (16 
* i & ms[1]) | (32 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, const fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]) - | (512 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[l], in, 
_mm256_mul_ps(rs[l], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[l], rn, _mm256_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue6HHHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = StateSpace::Create(14); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); + __m256 rn, in; + __m256 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = 
matrix[p[j]]; - } + i *= 8; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]) | (256 * i & ms[5]); + auto p0 = rstate + 2 * ii; - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } double re = 0; double im = 0; - uint64_t j = 0; - for (unsigned l = 0; l < 32; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -7570,16 +1289,16 @@ class SimulatorAVX final { j += 2; - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - unsigned m = 2 * l; + unsigned m = lsize * k; 
__m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); @@ -7591,272 +1310,40 @@ class SimulatorAVX final { return std::complex{re, im}; }; - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = StateSpace::Create(13); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t 
i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]) | (128 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - return std::complex{re, im}; - }; + auto m = GetMasks11(qs); - const fp_type* rstate = state.get(); + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get()); } - std::complex ExpectationValue6HHHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; +#endif // __BMI2__ - auto s = StateSpace::Create(12); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + for (unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; - for (unsigned i = 0; i < 7; ++i) { for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); } idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, 
uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, const fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t k = (8 * i & ms[0]) | (16 * i & ms[1]) | (32 * i & ms[2]) - | (64 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 3, mask); - return bits::ExpandBits((c + b) % lsize, 3, mask); } For for_; diff --git a/lib/simulator_avx512.h b/lib/simulator_avx512.h index ddce44db..9e03a867 100644 --- a/lib/simulator_avx512.h +++ b/lib/simulator_avx512.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "statespace_avx512.h" namespace qsim { @@ -30,7 +31,7 @@ namespace qsim { * Quantum circuit simulator with AVX512 vectorization. */ template -class SimulatorAVX512 final { +class SimulatorAVX512 final : public SimulatorBase { public: using StateSpace = StateSpaceAVX512; using State = typename StateSpace::State; @@ -52,68 +53,68 @@ class SimulatorAVX512 final { switch (qs.size()) { case 1: if (qs[0] > 3) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 3) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 3) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } else { - ApplyGate3LLL(qs, matrix, state); + ApplyGateL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 3) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if 
(qs[1] > 3) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate4HLLL(qs, matrix, state); + ApplyGateL<1, 3>(qs, matrix, state); } else { - ApplyGate4LLLL(qs, matrix, state); + ApplyGateL<0, 4>(qs, matrix, state); } break; case 5: if (qs[0] > 3) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate5HHLLL(qs, matrix, state); + ApplyGateL<2, 3>(qs, matrix, state); } else { - ApplyGate5HLLLL(qs, matrix, state); + ApplyGateL<1, 4>(qs, matrix, state); } break; case 6: if (qs[0] > 3) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate6HHHLLL(qs, matrix, state); + ApplyGateL<3, 3>(qs, matrix, state); } else { - ApplyGate6HHLLLL(qs, matrix, state); + ApplyGateL<2, 4>(qs, matrix, state); } break; default: @@ -126,13 +127,16 @@ class SimulatorAVX512 final { * Applies a controlled gate using AVX512 instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. 
*/ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -142,96 +146,96 @@ class SimulatorAVX512 final { case 1: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 3: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, 
state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[3] > 3) { if (cqs[0] > 3) { - 
ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -256,68 +260,68 @@ class SimulatorAVX512 final { switch (qs.size()) { case 1: if (qs[0] > 3) { - return ExpectationValue1H(qs, matrix, state); + return ExpectationValueH<1>(qs, matrix, state); } else { - return ExpectationValue1L(qs, matrix, state); + return ExpectationValueL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 3) { - return ExpectationValue2HH(qs, matrix, state); + return ExpectationValueH<2>(qs, matrix, state); } else if (qs[1] > 3) { - return ExpectationValue2HL(qs, matrix, state); + return ExpectationValueL<1, 1>(qs, matrix, state); } else { - return ExpectationValue2LL(qs, matrix, state); + return ExpectationValueL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 3) { - return ExpectationValue3HHH(qs, matrix, state); + return ExpectationValueH<3>(qs, matrix, state); } else if (qs[1] > 3) { - return ExpectationValue3HHL(qs, matrix, state); + return ExpectationValueL<2, 1>(qs, matrix, state); } else if (qs[2] > 3) { - return ExpectationValue3HLL(qs, matrix, state); + return ExpectationValueL<1, 2>(qs, matrix, state); } else { - return ExpectationValue3LLL(qs, matrix, state); + return ExpectationValueL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 3) { - return ExpectationValue4HHHH(qs, matrix, state); + return ExpectationValueH<4>(qs, matrix, state); } else if (qs[1] > 3) { - return ExpectationValue4HHHL(qs, matrix, state); + return ExpectationValueL<3, 1>(qs, matrix, state); } 
else if (qs[2] > 3) { - return ExpectationValue4HHLL(qs, matrix, state); + return ExpectationValueL<2, 2>(qs, matrix, state); } else if (qs[3] > 3) { - return ExpectationValue4HLLL(qs, matrix, state); + return ExpectationValueL<1, 3>(qs, matrix, state); } else { - return ExpectationValue4LLLL(qs, matrix, state); + return ExpectationValueL<0, 4>(qs, matrix, state); } break; case 5: if (qs[0] > 3) { - return ExpectationValue5HHHHH(qs, matrix, state); + return ExpectationValueH<5>(qs, matrix, state); } else if (qs[1] > 3) { - return ExpectationValue5HHHHL(qs, matrix, state); + return ExpectationValueL<4, 1>(qs, matrix, state); } else if (qs[2] > 3) { - return ExpectationValue5HHHLL(qs, matrix, state); + return ExpectationValueL<3, 2>(qs, matrix, state); } else if (qs[3] > 3) { - return ExpectationValue5HHLLL(qs, matrix, state); + return ExpectationValueL<2, 3>(qs, matrix, state); } else { - return ExpectationValue5HLLLL(qs, matrix, state); + return ExpectationValueL<1, 4>(qs, matrix, state); } break; case 6: if (qs[0] > 3) { - return ExpectationValue6HHHHHH(qs, matrix, state); + return ExpectationValueH<6>(qs, matrix, state); } else if (qs[1] > 3) { - return ExpectationValue6HHHHHL(qs, matrix, state); + return ExpectationValueL<5, 1>(qs, matrix, state); } else if (qs[2] > 3) { - return ExpectationValue6HHHHLL(qs, matrix, state); + return ExpectationValueL<4, 2>(qs, matrix, state); } else if (qs[3] > 3) { - return ExpectationValue6HHHLLL(qs, matrix, state); + return ExpectationValueL<3, 3>(qs, matrix, state); } else { - return ExpectationValue6HHLLLL(qs, matrix, state); + return ExpectationValueL<2, 4>(qs, matrix, state); } break; default: @@ -336,44 +340,28 @@ class SimulatorAVX512 final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - 
- uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 ru, iu, rn, in; - __m512 rs[2], is[2]; + __m512 rs[hsize], is[hsize]; - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + auto p0 = rstate + _pdep_u64(i, imaskh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); rn = _mm512_mul_ps(rs[0], ru); @@ -383,91 +371,64 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 2; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks1(qs); - unsigned k = 5; + unsigned k = 4 + H; unsigned n = state.num_qubits() 
> k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); } - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m512 rn, in; - __m512 rs[2], is[2]; + __m512 rs[gsize], is[gsize]; - auto p0 = rstate + 32 * i; + auto p0 = rstate + _pdep_u64(i, imaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], 
rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -475,71 +436,60 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, idx, rstate); + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 ru, iu, rn, in; - __m512 rs[4], is[4]; + __m512 rs[hsize], is[hsize]; - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); rn = _mm512_mul_ps(rs[0], ru); @@ -549,112 +499,57 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm512_set1_ps(v[j]); iu = 
_mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - unsigned k = 6; + unsigned k = 4 + H + cqs.size(); unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k 
+ 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + __m512 rn, in; + __m512 rs[hsize], is[hsize]; - auto p0 = rstate + 2 * k; + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -662,89 +557,66 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = 
_mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + __m512 w[1 << (1 + 2 * H)]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, idx, rstate); + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); } - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, 
uint64_t qmaskh, uint64_t cvalsh, const __m512i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m512 rn, in; - __m512 rs[4], is[4]; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - auto p0 = rstate + 32 * i; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -752,72 +624,74 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - for_.Run(size, f, w, idx, rstate); - } + unsigned r = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); } + } + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 ru, iu, rn, in; - __m512 rs[8], is[8]; + __m512 rs[hsize], is[hsize]; - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); + auto p0 = rstate + _pdep_u64(i, imaskh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } + double re = 0; + double im = 0; uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); rn = _mm512_mul_ps(rs[0], ru); @@ -827,116 +701,72 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 8; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in)); + + re += 
detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); } + + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + auto m = GetMasks1(qs); - unsigned k = 7; + unsigned k = 4 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + std::complex 
ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m512 rn, in; - __m512 rs[8], is[8]; + __m512 rs[gsize], is[gsize]; - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + auto p0 = rstate + _pdep_u64(i, imaskh); - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); } } + double re = 0; + double im = 0; uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -944,7930 +774,58 @@ class SimulatorAVX512 final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = 
_mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; + unsigned m = lsize * k; 
- for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); } - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, idx, rstate); + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); } - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; - for (unsigned i = 0; i < 7; ++i) { for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); } idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], p[9], p[8], p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm512_load_ps(p0); - is[8 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = 
_mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, idx, rstate); - } - - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j 
+= 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 
0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + 
xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 
16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate4LLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = 
_mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, idx, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; 
- } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(13); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - 
uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(12); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = 
_mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], 
p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate5HLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]) - | (1024 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t 
j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(15); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j 
= 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(14); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 
16; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(13); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = 
_mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyGate6HHLLLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(12); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { 
- rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 ru, iu, rn, 
in; - __m512 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, 
emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = StateSpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; 
++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, 
rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 
1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, 
fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j 
= 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, 
rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 
1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, 
uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 
0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - 
const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm512_load_ps(p0); - is[8 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - 
is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate3LLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; 
m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm512_load_ps(p0); - is[8 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = 
_mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = StateSpace::Create(12); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == 
(p[j] / 2) % 16 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, 
uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for 
(unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = 
[](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - 
- for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t 
cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4HLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 
1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4LLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - void ApplyControlledGate4LLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], 
p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, idx, rstate); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[2], is[2]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue1L(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = 
_mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue2HH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[4], is[4]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], 
ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue2HL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned 
l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue2LL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in 
= _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue3HHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[8], is[8]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in 
= _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue3HHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 
8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue3HLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * 
l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue3LLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm512_load_ps(p0); - is[8 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], 
w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue4HHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - 
rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue4HHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i 
= 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue4HHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - 
for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue4HLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(9); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; 
j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue4LLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - auto p0 = rstate + 32 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; 
++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, idx, rstate); - } - - std::complex ExpectationValue5HHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = 
_mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue5HHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(13); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], 
p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(12); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * 
i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(11); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); - - auto p0 = 
rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue5HLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(10); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); 
- - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 16 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]) - | (1024 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[l], rn, 
_mm512_mul_ps(rs[l], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue6HHHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = StateSpace::Create(15); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const 
__m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 2 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = StateSpace::Create(14); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & ms[3]) | (256 * 
i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 4 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHHLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = StateSpace::Create(13); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) - | (128 * i & 
ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 8 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - std::complex ExpectationValue6HHLLLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = StateSpace::Create(12); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, const fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & 
ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - unsigned m = 16 * l; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 4, mask); - return bits::ExpandBits((c + b) % lsize, 4, mask); } For for_; diff --git a/lib/simulator_basic.h b/lib/simulator_basic.h index 707825e8..9e136c68 100644 --- a/lib/simulator_basic.h +++ b/lib/simulator_basic.h @@ -15,12 +15,12 @@ #ifndef SIMULATOR_BASIC_H_ #define SIMULATOR_BASIC_H_ - -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "statespace_basic.h" namespace qsim { @@ -29,7 +29,7 @@ namespace qsim { * Quantum circuit simulator without vectorization. */ template -class SimulatorBasic final { +class SimulatorBasic final : public SimulatorBase { public: using StateSpace = StateSpaceBasic; using State = typename StateSpace::State; @@ -50,22 +50,22 @@ class SimulatorBasic final { switch (qs.size()) { case 1: - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); break; case 2: - ApplyGate2H(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); break; case 3: - ApplyGate3H(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); break; case 4: - ApplyGate4H(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); break; case 5: - ApplyGate5H(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); break; case 6: - ApplyGate6H(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); break; default: // Not implemented. @@ -77,13 +77,15 @@ class SimulatorBasic final { * Applies a controlled gate using non-vectorized instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. 
* @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -91,16 +93,16 @@ class SimulatorBasic final { switch (qs.size()) { case 1: - ApplyControlledGate1H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); break; case 2: - ApplyControlledGate2H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); break; case 3: - ApplyControlledGate3H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); break; case 4: - ApplyControlledGate4H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); break; default: // Not implemented. @@ -123,22 +125,22 @@ class SimulatorBasic final { switch (qs.size()) { case 1: - return ExpectationValue1H(qs, matrix, state); + return ExpectationValueH<1>(qs, matrix, state); break; case 2: - return ExpectationValue2H(qs, matrix, state); + return ExpectationValueH<2>(qs, matrix, state); break; case 3: - return ExpectationValue3H(qs, matrix, state); + return ExpectationValueH<3>(qs, matrix, state); break; case 4: - return ExpectationValue4H(qs, matrix, state); + return ExpectationValueH<4>(qs, matrix, state); break; case 5: - return ExpectationValue5H(qs, matrix, state); + return ExpectationValueH<5>(qs, matrix, state); break; case 6: - return ExpectationValue6H(qs, matrix, state); + return ExpectationValueH<6>(qs, matrix, state); break; default: // Not implemented. 
@@ -156,1088 +158,144 @@ class SimulatorBasic final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 1; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate2H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { fp_type rn, in; - fp_type rs[4], is[4]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]); + fp_type rs[hsize], is[hsize]; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; + auto p0 = rstate + 2 * ii; - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate3H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); } uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = rs[0] * v[j] - is[0] * v[j + 1]; in = rs[0] * v[j + 1] + is[0] * v[j]; j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; j += 2; } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; } }; - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - void ApplyGate4H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + FillIndices(state.num_qubits(), qs, ms, xss); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, ms, xss, state.get()); } - void ApplyGate5H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]) | (32 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate6H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]) | (32 * i & ms[5]) - | (64 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyControlledGate1H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 1 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } + uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { fp_type rn, in; - fp_type rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; + fp_type rs[hsize], is[hsize]; - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; + if ((ii & cmaskh) == cvalsh) { + auto p0 = rstate + 2 * ii; - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + 
is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - emaskh = ~emaskh; + uint64_t j = 0; - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] 
- is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; j += 2; - } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + j += 2; + } - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; } } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - return std::complex{re, im}; }; - const fp_type* rstate = 
state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 1; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + FillIndices(state.num_qubits(), qs, ms, xss); - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - std::complex ExpectationValue2H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; uint64_t size = uint64_t{1} << n; - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); } - std::complex ExpectationValue3H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, const uint64_t* ms, const uint64_t* xss, const fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - double re = 0; - double im = 0; + constexpr unsigned hsize = 1 << H; - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue4H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { fp_type rn, in; - fp_type rs[16], is[16]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + fp_type rs[hsize], is[hsize]; - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } + auto p0 = rstate + 2 * ii; - std::complex ExpectationValue5H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]) | (32 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); } double re = 0; @@ -1245,111 +303,36 @@ class SimulatorBasic final { uint64_t j = 0; - for (unsigned l = 0; l < 32; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = rs[0] * v[j] - is[0] * v[j + 1]; in = rs[0] * v[j + 1] + is[0] * v[j]; j += 2; - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; j += 2; } - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; + re += rs[k] * rn + is[k] * in; + im += rs[k] * in - is[k] * rn; } return 
std::complex{re, im}; }; - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue6H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - fp_type rn, in; - fp_type rs[64], is[64]; - - uint64_t k = (1 * i & ms[0]) | (2 * i & ms[1]) | (4 * i & ms[2]) - | (8 * i & ms[3]) | (16 * i & ms[4]) | (32 * i & ms[5]) - | (64 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - re += rs[l] * rn + is[l] * in; - im += rs[l] * in - is[l] * rn; - } - - return std::complex{re, im}; - }; + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - const fp_type* rstate = state.get(); + FillIndices(state.num_qubits(), qs, ms, xss); - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; uint64_t size = uint64_t{1} << n; using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); } For for_; diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 36b1e869..b507a224 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -60,6 +60,10 @@ class SimulatorCUDA final { ErrorCheck(cudaFree(d_idx)); ErrorCheck(cudaFree(d_ms)); ErrorCheck(cudaFree(d_xss)); + + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } } /** diff --git a/lib/simulator_custatevec.h b/lib/simulator_custatevec.h new file mode 100644 index 00000000..df01d974 --- /dev/null +++ b/lib/simulator_custatevec.h @@ -0,0 +1,182 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUSTATEVEC_H_ +#define SIMULATOR_CUSTATEVEC_H_ + +#include +#include +#include + +#include +#include + +#include "statespace_custatevec.h" +#include "util_custatevec.h" + +namespace qsim { + +/** + * Quantum circuit simulator using the NVIDIA cuStateVec library. 
+ */ +template +class SimulatorCuStateVec final { + public: + using StateSpace = StateSpaceCuStateVec; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + static constexpr auto kStateType = StateSpace::kStateType; + static constexpr auto kMatrixType = StateSpace::kMatrixType; + static constexpr auto kExpectType = StateSpace::kExpectType; + static constexpr auto kComputeType = StateSpace::kComputeType; + static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; + + explicit SimulatorCuStateVec(const custatevecHandle_t& handle) + : handle_(handle), workspace_(nullptr), workspace_size_(0) {} + + ~SimulatorCuStateVec() { + ErrorCheck(cudaFree(workspace_)); + } + + /** + * Applies a gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), 0, matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + handle_, state.get(), kStateType, state.num_qubits(), + matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, + kComputeType, workspace_, workspace_size)); + } + + /** + * Applies a controlled gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + std::vector control_bits; + control_bits.reserve(cqs.size()); + + for (std::size_t i = 0; i < cqs.size(); ++i) { + control_bits.push_back((cmask >> i) & 1); + } + + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), cqs.size(), matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + handle_, state.get(), kStateType, state.num_qubits(), + matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), + (int32_t*) cqs.data(), control_bits.data(), cqs.size(), + kComputeType, workspace_, workspace_size)); + } + + /** + * Computes the expectation value of an operator using the NVIDIA cuStateVec + * library. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto workspace_size = ExpectationValueWorkSpaceSize( + state.num_qubits(), qs.size(), matrix); + AllocWorkSpace(workspace_size); + + cuDoubleComplex eval; + + ErrorCheck(custatevecComputeExpectation( + handle_, state.get(), kStateType, state.num_qubits(), + &eval, kExpectType, nullptr, matrix, kMatrixType, + kMatrixLayout, (int32_t*) qs.data(), qs.size(), + kComputeType, workspace_, workspace_size)); + + return {cuCreal(eval), cuCimag(eval)}; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: + size_t ApplyGateWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, unsigned num_controls, + const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecApplyMatrixGetWorkspaceSize( + handle_, kStateType, num_qubits, matrix, kMatrixType, + kMatrixLayout, 0, num_targets, num_controls, kComputeType, + &size)); + + return size; + } + + size_t ExpectationValueWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( + handle_, kStateType, num_qubits, matrix, kMatrixType, + kMatrixLayout, num_targets, kComputeType, &size)); + + return size; + } + + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const custatevecHandle_t handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // SIMULATOR_CUSTATEVEC_H_ diff --git a/lib/simulator_sse.h b/lib/simulator_sse.h index 760a707a..50279e22 100644 --- a/lib/simulator_sse.h +++ b/lib/simulator_sse.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "statespace_sse.h" namespace qsim { @@ -30,7 +31,7 @@ namespace qsim { * Quantum circuit simulator with SSE vectorization. 
*/ template -class SimulatorSSE final { +class SimulatorSSE final : public SimulatorBase { public: using StateSpace = StateSpaceSSE; using State = typename StateSpace::State; @@ -52,54 +53,54 @@ class SimulatorSSE final { switch (qs.size()) { case 1: if (qs[0] > 1) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 1) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 1) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } break; case 4: if (qs[0] > 1) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } break; case 5: if (qs[0] > 1) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } break; case 6: if (qs[0] > 1) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } break; default: @@ -112,13 +113,16 @@ class SimulatorSSE final { * Applies a controlled gate using SSE instructions. 
* @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, + const std::vector& cqs, uint64_t cvals, const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -128,78 +132,78 @@ class SimulatorSSE final { case 1: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 
1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 3: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -224,54 +228,54 @@ class 
SimulatorSSE final { switch (qs.size()) { case 1: if (qs[0] > 1) { - return ExpectationValue1H(qs, matrix, state); + return ExpectationValueH<1>(qs, matrix, state); } else { - return ExpectationValue1L(qs, matrix, state); + return ExpectationValueL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 1) { - return ExpectationValue2HH(qs, matrix, state); + return ExpectationValueH<2>(qs, matrix, state); } else if (qs[1] > 1) { - return ExpectationValue2HL(qs, matrix, state); + return ExpectationValueL<1, 1>(qs, matrix, state); } else { - return ExpectationValue2LL(qs, matrix, state); + return ExpectationValueL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 1) { - return ExpectationValue3HHH(qs, matrix, state); + return ExpectationValueH<3>(qs, matrix, state); } else if (qs[1] > 1) { - return ExpectationValue3HHL(qs, matrix, state); + return ExpectationValueL<2, 1>(qs, matrix, state); } else { - return ExpectationValue3HLL(qs, matrix, state); + return ExpectationValueL<1, 2>(qs, matrix, state); } break; case 4: if (qs[0] > 1) { - return ExpectationValue4HHHH(qs, matrix, state); + return ExpectationValueH<4>(qs, matrix, state); } else if (qs[1] > 1) { - return ExpectationValue4HHHL(qs, matrix, state); + return ExpectationValueL<3, 1>(qs, matrix, state); } else { - return ExpectationValue4HHLL(qs, matrix, state); + return ExpectationValueL<2, 2>(qs, matrix, state); } break; case 5: if (qs[0] > 1) { - return ExpectationValue5HHHHH(qs, matrix, state); + return ExpectationValueH<5>(qs, matrix, state); } else if (qs[1] > 1) { - return ExpectationValue5HHHHL(qs, matrix, state); + return ExpectationValueL<4, 1>(qs, matrix, state); } else { - return ExpectationValue5HHHLL(qs, matrix, state); + return ExpectationValueL<3, 2>(qs, matrix, state); } break; case 6: if (qs[0] > 1) { - return ExpectationValue6HHHHHH(qs, matrix, state); + return ExpectationValueH<6>(qs, matrix, state); } else if (qs[1] > 1) { - return ExpectationValue6HHHHHL(qs, matrix, state); + return 
ExpectationValueL<5, 1>(qs, matrix, state); } else { - return ExpectationValue6HHHHLL(qs, matrix, state); + return ExpectationValueL<4, 2>(qs, matrix, state); } break; default: @@ -290,199 +294,34 @@ class SimulatorSSE final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[2], is[2]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[4]; - - auto s = StateSpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - auto p0 = rstate + 8 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - for_.Run(size, f, w, qs[0], rstate); - } + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + i *= 4; - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[4], is[4]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]); - auto p0 = rstate + 2 * k; + auto p0 = rstate + 2 * ii; - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); rn = _mm_mul_ps(rs[0], ru); @@ -492,182 +331,81 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], 
iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 4; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, ms, xss, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, const uint64_t* ms, const uint64_t* xss, 
unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + __m128 rn, in; + __m128 rs[gsize], is[gsize]; - j += 2; - } + i *= 4; - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, qs[0], rstate); - } - - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned p[4]; - - auto s = StateSpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + auto p0 = rstate + 2 * ii; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); } } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - auto p0 = rstate + 8 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -675,72 +413,68 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 
<< H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); - unsigned k = 2; + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, rstate); + for_.Run(size, f, w, ms, xss, qs[0], state.get()); } - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m128 ru, iu, rn, in; - __m128 rs[8], is[8]; + __m128 rs[hsize], is[hsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); + if ((ii & cmaskh) != cvalsh) return; - auto p0 = rstate + 2 * k; + auto p0 = rstate + 2 * ii; - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = 
_mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); rn = _mm_mul_ps(rs[0], ru); @@ -750,105 +484,67 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 8; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 5; + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(7); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - unsigned l = 2 * (8 * i + m); + __m128 rn, in; + __m128 rs[hsize], is[hsize]; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + i *= 4; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) 
{ - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]); - auto p0 = rstate + 2 * k; + if ((ii & cmaskh) != cvalsh) return; - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); + auto p0 = rstate + 2 * ii; - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -856,101 +552,85 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H)]; - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 2 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, qs[0], rstate); + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); } - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, unsigned q0, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - unsigned l = 2 * (8 * i + m); + __m128 rn, in; + __m128 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + i *= 4; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } + uint64_t ii = i 
& ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - } - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; + if ((ii & cmaskh) != cvalsh) return; - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); + auto p0 = rstate + 2 * ii; - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -958,72 +638,76 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned r = 2 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, w, ms, xss, rstate); - } + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); } + } + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m128 ru, iu, rn, in; - __m128 rs[16], is[16]; + __m128 rs[hsize], is[hsize]; - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } - auto p0 = rstate + 2 * k; + auto p0 = rstate + 2 * ii; - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = 
_mm_load_ps(p0 + xss[k] + 4); } + double re = 0; + double im = 0; uint64_t j = 0; - for (unsigned l = 0; l < 16; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); rn = _mm_mul_ps(rs[0], ru); @@ -1033,106 +717,90 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 16; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in)); + __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn)); + + re += detail::HorizontalSumSSE(v_re); + im += detail::HorizontalSumSSE(v_im); } + + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); - unsigned k = 6; + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; - for_.Run(size, f, matrix, ms, xss, rstate); + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); } - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(9); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, unsigned q0, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - unsigned l = 2 * (16 * i + m); + __m128 rn, in; + __m128 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + i *= 4; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const 
uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); + auto p0 = rstate + 2 * ii; - auto p0 = rstate + 2 * k; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } } + double re = 0; + double im = 0; uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -1140,5017 +808,42 @@ class SimulatorSSE final { j += 2; - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = 
_mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + unsigned m = lsize * k; + + __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); + __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); + + re += detail::HorizontalSumSSE(v_re); + im += detail::HorizontalSumSSE(v_im); } + + return std::complex{re, im}; }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + auto m = GetMasks11(qs); - for_.Run(size, f, w, ms, xss, qs[0], rstate); - } + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(8); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned 
j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(11); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, qs[0], rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(10); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for 
(unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]) - | (256 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(13); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, qs[0], rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(12); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - 
for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = 
state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 
1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, 
_mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 
1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = 
~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const 
__m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], 
iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(8); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 
1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(7); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for 
(unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(7); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = 
_mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, 
_mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(10); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 
16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(9); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for 
(unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(9); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], rstate); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(8); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - 
- for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = StateSpace::Create(8); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, rstate); - } - - std::complex ExpectationValue1H(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[2], is[2]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue1L(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[4]; - - auto s = StateSpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - auto p0 = rstate + 8 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, qs[0], rstate); - } - - std::complex ExpectationValue2HH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[4], is[4]; - - uint64_t k = (4 * i & ms[0]) | (8 
* i & ms[1]) | (16 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue2HL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 2 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], rstate); - } - - std::complex ExpectationValue2LL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned p[4]; - - auto s = StateSpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - auto p0 = rstate + 8 * i; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, 
_mm_mul_ps(is[n], w[j])); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, rstate); - } - - std::complex ExpectationValue3HHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[8], is[8]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm_set1_ps(v[j]); - 
iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue3HHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(7); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, 
unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 2 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], rstate); - } - - std::complex ExpectationValue3HLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - double re 
= 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 4 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, rstate); - } - - std::complex ExpectationValue4HHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[16], is[16]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); - 
- auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue4HHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(9); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 2 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], rstate); - } - - std::complex ExpectationValue4HHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(8); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = 
_mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 4 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, rstate); - } - - std::complex ExpectationValue5HHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[l], in), 
_mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue5HHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(11); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - 
rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 2 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], rstate); - } - - std::complex ExpectationValue5HHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(10); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 
* l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 4 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, rstate); - } - - std::complex ExpectationValue6HHHHHH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]) - | (256 * i & ms[6]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[l], rn), _mm_mul_ps(is[l], in)); - __m128 v_im = 
_mm_sub_ps(_mm_mul_ps(rs[l], in), _mm_mul_ps(is[l], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); - } - - std::complex ExpectationValue6HHHHHL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(13); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, const fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]) | (128 * i & ms[5]); - - auto p0 = 
rstate + 2 * k; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 2 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], rstate); - } - - std::complex ExpectationValue6HHHHLL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = StateSpace::Create(12); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t k = (4 * i & ms[0]) | (8 * i & ms[1]) | (16 * i & ms[2]) - | (32 * i & ms[3]) | (64 * i & ms[4]); - - auto p0 = rstate + 2 * k; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * 
l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - unsigned m = 4 * l; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - const fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 2, mask); - return bits::ExpandBits((c + b) % lsize, 2, mask); + return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get()); } For for_; diff --git a/lib/statespace_custatevec.h b/lib/statespace_custatevec.h new file mode 100644 index 00000000..12d8db55 --- /dev/null +++ b/lib/statespace_custatevec.h @@ -0,0 +1,378 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_CUSTATEVEC_H_ +#define STATESPACE_CUSTATEVEC_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "statespace.h" +#include "util_custatevec.h" +#include "vectorspace_cuda.h" + +namespace qsim { + +namespace detail { + +template +__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + + if (k < size) { + state[2 * k] = v; + state[2 * k + 1] = 0; + } +} + +} // namespace detail + +/** + * Object containing context and routines for cuStateVec state-vector + * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. + */ +template +class StateSpaceCuStateVec : + public StateSpace, VectorSpaceCUDA, FP> { + private: + using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + private: + static constexpr auto is_float = std::is_same::value; + + public: + static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F; + static constexpr auto kMatrixType = kStateType; + static constexpr auto kExpectType = CUDA_C_64F; + static constexpr auto kComputeType = + is_float ? 
CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; + static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; + + explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle, + const custatevecHandle_t& custatevec_handle) + : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), + workspace_(nullptr), workspace_size_(0) {} + + virtual ~StateSpaceCuStateVec() { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + } + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const { + } + + void NormalToInternalOrder(State& state) const { + } + + void SetAllZeros(State& state) const { + ErrorCheck(cudaMemset(state.get(), 0, + MinSize(state.num_qubits()) * sizeof(fp_type))); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + unsigned threads = size < 256 ? size : 256; + unsigned blocks = size / threads; + + fp_type v = double{1} / std::sqrt(size); + + detail::SetStateUniformKernel<<>>(v, size, state.get()); + ErrorCheck(cudaPeekAtLastError()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + fp_type one[1] = {1}; + ErrorCheck( + cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static std::complex GetAmpl(const State& state, uint64_t i) { + fp_type a[2]; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost)); + return std::complex(a[0], a[1]); + } + + // It is not recommended to use this function. 
+ static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + fp_type a[2] = {std::real(ampl), std::imag(ampl)}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + fp_type a[2] = {re, im}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + // Not implemented. + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + // Not implemented. + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = uint64_t{1} << src.num_qubits(); + + if (is_float) { + cuComplex a = {1.0, 0.0}; + auto p1 = (const cuComplex*) src.get(); + auto p2 = (cuComplex*) dest.get(); + ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } else { + cuDoubleComplex a = {1.0, 0.0}; + auto p1 = (const cuDoubleComplex*) src.get(); + auto p2 = (cuDoubleComplex*) dest.get(); + ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } + + return true; + } + + // Does the equivalent of state *= a elementwise. 
+ void Multiply(fp_type a, State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) { + float a1 = a; + auto p = (cuComplex*) state.get(); + ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1)); + } else { + double a1 = a; + auto p = (cuDoubleComplex*) state.get(); + ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1)); + } + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + uint64_t size = uint64_t{1} << state1.num_qubits(); + + if (is_float) { + cuComplex result; + auto p1 = (const cuComplex*) state1.get(); + auto p2 = (const cuComplex*) state2.get(); + ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCrealf(result), cuCimagf(result)}; + } else { + cuDoubleComplex result; + auto p1 = (const cuDoubleComplex*) state1.get(); + auto p2 = (const cuDoubleComplex*) state2.get(); + ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCreal(result), cuCimag(result)}; + } + } + + double RealInnerProduct(const State& state1, const State& state2) const { + return std::real(InnerProduct(state1, state2)); + } + + double Norm(const State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) { + float result; + auto p = (const cuComplex*) state.get(); + ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } else { + double result; + auto p = (const cuDoubleComplex*) state.get(); + ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + auto rs = GenerateRandomValues(num_samples, seed, 1.0); + + size_t workspace_size; + custatevecSamplerDescriptor_t sampler; + + 
ErrorCheck(custatevecSamplerCreate( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), &sampler, num_samples, + &workspace_size)); + + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecSamplerPreprocess( + custatevec_handle_, sampler, workspace_, workspace_size)); + + std::vector bitstrings0(num_samples); + std::vector bitordering; + + bitordering.reserve(state.num_qubits()); + for (unsigned i = 0; i < state.num_qubits(); ++i) { + bitordering.push_back(i); + } + + ErrorCheck(custatevecSamplerSample( + custatevec_handle_, sampler, bitstrings0.data(), + bitordering.data(), state.num_qubits(), rs.data(), + num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER)); + + bitstrings.reserve(num_samples); + for (unsigned i = 0; i < num_samples; ++i) { + bitstrings.push_back(bitstrings0[i]); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state, + bool no_collapse = false) const { + auto r = RandomValue(rgen, 1.0); + + MeasurementResult result; + + result.valid = true; + result.mask = 0; + result.bits = 0; + result.bitstring.resize(qubits.size(), 0); + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto collapse = no_collapse ? 
+ CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; + + ErrorCheck(custatevecBatchMeasure( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), (int*) result.bitstring.data(), + (int*) qubits.data(), qubits.size(), r, collapse)); + + for (std::size_t i = 0; i < result.bitstring.size(); ++i) { + result.bits |= result.bitstring[i] << qubits[i]; + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + return Measure(qubits, rgen, const_cast(state), true); + } + + void Collapse(const MeasurementResult& mr, State& state) const { + unsigned count = 0; + + std::vector bitstring; + std::vector bitordering; + + bitstring.reserve(state.num_qubits()); + bitordering.reserve(state.num_qubits()); + + for (unsigned i = 0; i < state.num_qubits(); ++i) { + if (((mr.mask >> i) & 1) != 0) { + bitstring.push_back((mr.bits >> i) & 1); + bitordering.push_back(i); + ++count; + } + } + + ErrorCheck(custatevecCollapseByBitString( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), bitstring.data(), bitordering.data(), + count, 1.0)); + + // TODO: do we need the following? 
+ double norm = Norm(state); + Multiply(1.0 / std::sqrt(norm), state); + } + + private: + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const cublasHandle_t cublas_handle_; + const custatevecHandle_t custatevec_handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // STATESPACE_CUSTATEVEC_H_ diff --git a/lib/unitary_calculator_avx.h b/lib/unitary_calculator_avx.h index 519ff26c..5e566ca9 100644 --- a/lib/unitary_calculator_avx.h +++ b/lib/unitary_calculator_avx.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "unitaryspace_avx.h" namespace qsim { @@ -31,7 +32,7 @@ namespace unitary { * Quantum circuit unitary calculator with AVX vectorization. */ template -class UnitaryCalculatorAVX final { +class UnitaryCalculatorAVX final : public SimulatorBase { public: using UnitarySpace = UnitarySpaceAVX; using Unitary = typename UnitarySpace::Unitary; @@ -50,68 +51,68 @@ class UnitaryCalculatorAVX final { * @param state The state of the system, to be updated by this method. */ void ApplyGate(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { + const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . 
switch (qs.size()) { case 1: if (qs[0] > 2) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 2) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 2) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } else { - ApplyGate3LLL(qs, matrix, state); + ApplyGateL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 2) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } else { - ApplyGate4HLLL(qs, matrix, state); + ApplyGateL<1, 3>(qs, matrix, state); } break; case 5: if (qs[0] > 2) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } else { - ApplyGate5HHLLL(qs, matrix, state); + ApplyGateL<2, 3>(qs, matrix, state); } break; case 6: if (qs[0] > 2) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else if (qs[1] > 2) { - ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else if (qs[2] > 2) { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } else { - 
ApplyGate6HHHLLL(qs, matrix, state); + ApplyGateL<3, 3>(qs, matrix, state); } break; default: @@ -124,13 +125,16 @@ class UnitaryCalculatorAVX final { * Applies a controlled gate using AVX instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, Unitary& state) const { + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -140,90 +144,90 @@ class UnitaryCalculatorAVX final { case 1: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, 
matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 3: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 2) { if (cqs[0] > 2) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 2) { if (cqs[0] > 2) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 2) { if (cqs[0] > 2) { - 
ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 2) { if (cqs[0] > 2) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 2) { - ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -241,46 +245,35 @@ class UnitaryCalculatorAVX final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[2], is[2]; + __m256 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]); + uint64_t r = i % size; 
+ uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -290,94 +283,70 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 2; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks1(qs); - unsigned k = 4; + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); } - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(2); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + __m256 rn, in; + __m256 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } + uint64_t r = i % size; + uint64_t s = i / size; - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - uint64_t ii = i % 
size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 16 * ii; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -385,75 +354,66 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - unsigned k = 3; + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[4], is[4]; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]); + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - auto p0 = rstate + row_size * r + 2 * c; + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } uint64_t j = 0; - for 
(unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -463,115 +423,63 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - unsigned k = 5; + unsigned k = 3 + H + cqs.size(); unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, 
uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 rn, in; - __m256 rs[4], is[4]; + __m256 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]); + uint64_t r = i % size; + uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -579,92 +487,73 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - unsigned k = 
4; + unsigned k = 3 + H + cqs.size() - m.cl; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); } - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, const __m256i* idx, uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m256 rn, in; - __m256 rs[4], is[4]; + __m256 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 
= rstate + row_size * r + 16 * ii; + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -672,76 +561,86 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); } }; - fp_type* rstate = state.get(); + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); - } + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); } + } + +#else // __BMI2__ + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m256 ru, iu, rn, in; - __m256 rs[8], is[8]; + __m256 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]); + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); rn = _mm256_mul_ps(rs[0], ru); @@ -751,119 +650,74 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm256_set1_ps(v[j]); iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); j 
+= 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 6; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, 
State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - unsigned l = 2 * (8 * i + m); + __m256 rn, in; + __m256 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + uint64_t r = 8 * (i % size); + uint64_t s = i / size; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]); + auto p0 = rstate + row_size * s + 2 * t; - auto p0 = rstate + row_size * r + 2 * c; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); } } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -871,206 +725,149 
@@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); - unsigned k = 5; + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get()); } - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - unsigned l = 2 * (8 * i + m); + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + uint64_t r = 8 * (i % 
size); + uint64_t s = i / size; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]); + if ((t & cmaskh) != cvalsh) return; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + 
in = _mm256_fmadd_ps(is[l], ru, in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 4; + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); } - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - unsigned l = 2 * (8 * i + m); + __m256 rn, in; 
+ __m256 rs[hsize], is[hsize]; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } + uint64_t r = 8 * (i % size); + uint64_t s = i / size; - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 16 * ii; + if ((t & cmaskh) != cvalsh) return; - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm256_mul_ps(rs[0], w[j]); in = _mm256_mul_ps(rs[0], w[j + 1]); rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); @@ -1078,4650 +875,148 @@ class UnitaryCalculatorAVX final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + 
uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - unsigned k = 3; + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); } - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - xss[i] = a; - } - auto f = [](unsigned n, unsigned m, 
uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[16], is[16]; + if ((t & cmaskh) != cvalsh) return; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]); + auto p0 = rstate + row_size * s + 2 * t; - auto p0 = rstate + row_size * r + 2 * c; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } } uint64_t j = 0; - for (unsigned l = 0; l < 16; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); j += 2; - for (unsigned n = 1; n < 16; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); j += 2; } - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + 
xss[k] + 8, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; - unsigned k = 7; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); } + } - unsigned p[8]; - __m256i idx[1]; +#endif // __BMI2__ - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; - unsigned qmask = (1 << qs[0]); + for 
(unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; - for (unsigned i = 0; i < 1; ++i) { for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); } idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - 
_mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, 
uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - 
rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[32], is[32]; 
- - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]) | (256 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * 
ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - 
uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / 
size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]) | (256 * ii & ms[5]) - | (512 * ii & ms[6]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = 
_mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - 
const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]) | (256 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(7); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - 
uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]) | (128 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i 
/ size; - uint64_t c = (8 * ii & ms[0]) | (16 * ii & ms[1]) | (32 * ii & ms[2]) - | (64 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - 
in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { 
- fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(2); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = 
_mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(2); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 
0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm256_load_ps(p0); - is[2 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - 
cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = 
_mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - - auto s = 
UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* 
xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 
2) % 4 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 
* l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) 
{ - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm256_load_ps(p0); - is[4 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - 
unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = 
_mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - 
- auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j 
+ 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 8 * k + 
2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* 
ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) 
/ 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 
1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3LLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(3); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - 
for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm256_load_ps(p0); - is[8 * l] = _mm256_load_ps(p0 + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0, rn); - _mm256_store_ps(p0 + 8, in); - } - }; - - fp_type* 
rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 7; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 ru, iu, rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = 
_mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[n], ru, rn); - in = _mm256_fmadd_ps(rs[n], iu, in); - rn = _mm256_fnmadd_ps(is[n], iu, rn); - in = _mm256_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh 
= ~emaskh ^ 7; - - unsigned p[8]; - - auto s = UnitarySpace::Create(6); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm256_load_ps(p0 + xss[l]); - is[l] = _mm256_load_ps(p0 + xss[l] + 8); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l 
+ j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[1]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (32 * i + 16 * 
k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm256_load_ps(p0 + xss[l]); - is[2 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm256_permutevar8x32_ps(rs[2 * l], idx[j - 1]); - is[2 * l + j] = _mm256_permutevar8x32_ps(is[2 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 
8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[3]; - - auto s = UnitarySpace::Create(5); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * 
(64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm256_load_ps(p0 + xss[l]); - is[4 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm256_permutevar8x32_ps(rs[4 * l], idx[j - 1]); - is[4 * l + j] = _mm256_permutevar8x32_ps(is[4 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HLLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* 
w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4HLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 3, emaskl); - - for (auto q : qs) { - if (q > 2) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 7; - - unsigned p[8]; - __m256i idx[7]; - - auto s = UnitarySpace::Create(4); - __m256* w = (__m256*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 8; ++j) { - unsigned k = bits::CompressBits(j, 3, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 8; ++j) { - 
fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[8 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 8; ++j) { - wf[8 * l + j + 8] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m256 rn, in; - __m256 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm256_load_ps(p0 + xss[l]); - is[8 * l] = _mm256_load_ps(p0 + xss[l] + 8); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm256_permutevar8x32_ps(rs[8 * l], idx[j - 1]); - is[8 * l + j] = _mm256_permutevar8x32_ps(is[8 * l], idx[j - 1]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm256_fmadd_ps(rs[n], w[j], rn); - in = _mm256_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm256_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[l], rn); - _mm256_store_ps(p0 + xss[l] + 8, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 3, mask); - return bits::ExpandBits((c + b) % lsize, 3, mask); } For for_; diff --git a/lib/unitary_calculator_avx512.h b/lib/unitary_calculator_avx512.h index 57ba6484..81053678 100644 --- a/lib/unitary_calculator_avx512.h +++ b/lib/unitary_calculator_avx512.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "unitaryspace_avx512.h" namespace qsim { @@ -31,7 +32,7 @@ namespace unitary { * Quantum circuit unitary calculator with AVX512 vectorization. */ template -class UnitaryCalculatorAVX512 final { +class UnitaryCalculatorAVX512 final : public SimulatorBase { public: using UnitarySpace = UnitarySpaceAVX512; using Unitary = typename UnitarySpace::Unitary; @@ -50,74 +51,74 @@ class UnitaryCalculatorAVX512 final { * @param state The state of the system, to be updated by this method. */ void ApplyGate(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { + const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . 
switch (qs.size()) { case 1: if (qs[0] > 3) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 3) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 3) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } else { - ApplyGate3LLL(qs, matrix, state); + ApplyGateL<0, 3>(qs, matrix, state); } break; case 4: if (qs[0] > 3) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate4HLLL(qs, matrix, state); + ApplyGateL<1, 3>(qs, matrix, state); } else { - ApplyGate4LLLL(qs, matrix, state); + ApplyGateL<0, 4>(qs, matrix, state); } break; case 5: if (qs[0] > 3) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 3) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate5HHLLL(qs, matrix, state); + ApplyGateL<2, 3>(qs, matrix, state); } else { - ApplyGate5HLLLL(qs, matrix, state); + ApplyGateL<1, 4>(qs, matrix, state); } break; case 6: if (qs[0] > 3) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else if (qs[1] > 3) { - 
ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else if (qs[2] > 3) { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } else if (qs[3] > 3) { - ApplyGate6HHHLLL(qs, matrix, state); + ApplyGateL<3, 3>(qs, matrix, state); } else { - ApplyGate6HHLLLL(qs, matrix, state); + ApplyGateL<2, 4>(qs, matrix, state); } break; default: @@ -130,13 +131,16 @@ class UnitaryCalculatorAVX512 final { * Applies a controlled gate using AVX512 instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, Unitary& state) const { + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
+ if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -146,96 +150,96 @@ class UnitaryCalculatorAVX512 final { case 1: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 3: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + 
ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 3) { if (cqs[0] > 3) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 3) { - ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[2] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } else if (qs[3] > 3) { if (cqs[0] > 3) { - ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); } } 
else { if (cqs[0] > 3) { - ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -253,46 +257,32 @@ class UnitaryCalculatorAVX512 final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 ru, iu, rn, in; - __m512 rs[2], is[2]; + __m512 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + uint64_t r = i % size; + uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); rn = 
_mm512_mul_ps(rs[0], ru); @@ -302,96 +292,70 @@ class UnitaryCalculatorAVX512 final { j += 2; - for (unsigned n = 1; n < 2; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks1(qs); - unsigned k = 5; + unsigned k = 4 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); } - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - 
unsigned l = 2 * (2 * i + m); + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } + __m512 rn, in; + __m512 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } + uint64_t r = i % size; + uint64_t s = i / size; - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 32 * ii; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -399,75 +363,66 @@ class UnitaryCalculatorAVX512 final { j += 2; - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - 
in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - unsigned k = 4; + unsigned k = 4 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* 
xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 ru, iu, rn, in; - __m512 rs[4], is[4]; + __m512 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + uint64_t r = i % size; + uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); rn = _mm512_mul_ps(rs[0], ru); @@ -477,117 +432,63 @@ class UnitaryCalculatorAVX512 final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm512_set1_ps(v[j]); iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - unsigned k = 6; + unsigned k = 4 + H + cqs.size(); unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t 
row_size, - fp_type* rstate) { + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m512 rn, in; - __m512 rs[4], is[4]; + __m512 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + uint64_t r = i % size; + uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -595,295 +496,73 @@ class UnitaryCalculatorAVX512 final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 32 * ii; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 
1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } + uint64_t p = _pdep_u64(k, qmaskh); - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); - } - - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; 
++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; + __m512 w[1 << (1 + 2 * H)]; - fp_type* rstate = state.get(); + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - unsigned k = 7; + unsigned k = 4 + H + cqs.size() - m.cl; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - 
for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, const __m512i* idx, uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + __m512 rn, in; - __m512 rs[8], is[8]; + __m512 rs[gsize], is[gsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + uint64_t r = i % size; + uint64_t s = i / size; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + rs[k2] = _mm512_load_ps(p0 + 
p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); } } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm512_mul_ps(rs[0], w[j]); in = _mm512_mul_ps(rs[0], w[j + 1]); rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); @@ -891,5489 +570,69 @@ class UnitaryCalculatorAVX512 final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); j += 2; } - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); } }; - fp_type* rstate = state.get(); + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + unsigned k = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; - unsigned p[16]; - __m512i idx[3]; + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } + } - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; - for (unsigned i = 0; i < 3; ++i) { for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); } idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], p[9], p[8], p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { 
- wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate3LLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[16]; - __m512i idx[7]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 32 * ii; - - for (unsigned l = 0; l < 1; ++l) { - rs[8 * l] = _mm512_load_ps(p0); - is[8 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = 
_mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); - } - - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm512_set1_ps(v[j]); - iu 
= _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; 
++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], 
is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate4HLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); - - auto p0 = 
rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate4LLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[16]; - __m512i idx[15]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, 
qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 32 * ii; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, idx, size, raw_size, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - 
_mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const 
__m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], 
is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HHLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = UnitarySpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - 
__m512 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate5HLLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & 
ms[1]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]) - | (1024 * ii & ms[6]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = 
_mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 10; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(8); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - 
auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 9; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[64], 
is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]) | (256 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHHLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[3] + 1); - ms[0] = (uint64_t{1} << qs[3]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 3] + 1); - ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[7]; - - auto s = UnitarySpace::Create(7); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - 
__m512 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) - | (128 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[8 * l] = _mm512_load_ps(p0 + xss[l]); - is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 8; ++j) { - rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); - is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyGate6HHLLLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[4] + 1); - ms[0] = (uint64_t{1} << qs[4]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 4] + 1); - ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[16]; - __m512i idx[15]; - - auto s = UnitarySpace::Create(6); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { 
- __m512 rn, in; - __m512 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[16 * l] = _mm512_load_ps(p0 + xss[l]); - is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); 
- in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; 
++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - 
- for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for 
(unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm512_load_ps(p0); - is[2 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, 
rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = 
_mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for 
(auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, 
uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned 
j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + 
row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(3); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = 
(1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm512_load_ps(p0); - is[4 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j 
+= 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 ru, iu, rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 
8; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[n], ru, rn); - in = _mm512_fmadd_ps(rs[n], iu, in); - rn = _mm512_fnmadd_ps(is[n], iu, rn); - in = _mm512_fmadd_ps(is[n], ru, in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, 
emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm512_load_ps(p0 + xss[l]); - is[l] = _mm512_load_ps(p0 + xss[l] + 16); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = 
matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[1]; - - auto s = UnitarySpace::Create(5); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = 
bits::CompressBits(j, 4, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm512_load_ps(p0 + xss[l]); - is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 2; ++j) { - rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); - is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = 
[](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[3]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 3; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - 
- for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm512_load_ps(p0 + xss[l]); - is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); - - for (unsigned j = 1; j < 4; ++j) { - rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); - is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0 + xss[l], rn); - _mm512_store_ps(p0 + xss[l] + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate3LLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[7]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); - - for (unsigned i = 0; i < 7; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto 
  // Same as ApplyControlledGate3LLL_H, but with at least one "low" control
  // qubit (q <= 3).  Low controls cannot be enforced via the outer index, so
  // they are folded into the coefficient table: lanes whose low control bits
  // do not match cmaskl get an identity row (1 on the diagonal, 0 elsewhere)
  // instead of the gate matrix element.
  //   qs     - gate qubit indices.
  //   cqs    - control qubit indices (mixed high/low).
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate3LLL_L(const std::vector<unsigned>& qs,
                                 const std::vector<unsigned>& cqs,
                                 uint64_t cmask, const fp_type* matrix,
                                 Unitary& state) const {
    unsigned cl = 0;
    uint64_t emaskl = 0;
    uint64_t emaskh = 0;

    // Split controls: high ones (q > 3) go into emaskh, low ones into emaskl.
    for (auto q : cqs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      } else {
        ++cl;
        emaskl |= uint64_t{1} << q;
      }
    }

    // Controls are ordered low-first in cmask: high part is cmask >> cl,
    // low part is the bottom cl bits expanded over the 4 lane bits.
    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    // Complement of fixed bits; low 4 lane bits excluded.
    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[7];

    auto s = UnitarySpace::Create(4);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);

    // Lane-permutation tables for the 7 non-identity gate-qubit shifts.
    for (unsigned i = 0; i < 7; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 1; ++i) {
      for (unsigned m = 0; m < 8; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);
        }

        unsigned l = 2 * (8 * i + m);

        // v is 1 exactly on the diagonal of the 8x8 matrix; lanes failing the
        // low-control match get the identity so those amplitudes pass through.
        for (unsigned j = 0; j < 16; ++j) {
          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;
          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
        }
      }
    }

    // Per-work-item kernel; identical shape to the _H variant since the low
    // controls are already baked into w.
    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[8], is[8];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 1; ++l) {
        rs[8 * l] = _mm512_load_ps(p0);
        is[8 * l] = _mm512_load_ps(p0 + 16);

        for (unsigned j = 1; j < 8; ++j) {
          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 1; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 8; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0, rn);
        _mm512_store_ps(p0 + 16, in);
      }
    };

    fp_type* rstate = state.get();

    // Low controls don't shrink the iterated index space, hence "- cl".
    unsigned k = 4 + cqs.size() - cl;
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate where all four gate qubits and all
  // control qubits are "high" (q > 3).  No in-register permutations are
  // needed: the 16 amplitude blocks live at distinct memory offsets (xss) and
  // each matrix element is broadcast across lanes with _mm512_set1_ps.
  //   qs     - gate qubit indices, ascending.
  //   cqs    - control qubit indices.
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHHH_H(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[4];
    uint64_t ms[5];

    // xs[i]: stride that flips gate qubit i; ms[i]: bit ranges between
    // consecutive gate qubits (used by callers/uniform kernel signature).
    xs[0] = uint64_t{1} << (qs[0] + 1);
    ms[0] = (uint64_t{1} << qs[0]) - 1;
    for (unsigned i = 1; i < 4; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);

    // xss[i]: combined offset for each of the 16 gate-qubit basis patterns.
    uint64_t xss[16];
    for (unsigned i = 0; i < 16; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 4; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    uint64_t emaskh = 0;

    for (auto q : cqs) {
      emaskh |= uint64_t{1} << q;
    }

    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);

    for (auto q : qs) {
      emaskh |= uint64_t{1} << q;
    }

    // Complement of fixed bits; low 4 lane bits excluded.
    emaskh = ~emaskh ^ 15;

    // Kernel: load 16 amplitude blocks, multiply by the broadcast matrix.
    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                uint64_t size, uint64_t row_size, fp_type* rstate) {
      __m512 ru, iu, rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 16; ++l) {
        rs[l] = _mm512_load_ps(p0 + xss[l]);
        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
      }

      uint64_t j = 0;

      // 16x16 complex matrix-vector product, one output block per l.
      for (unsigned l = 0; l < 16; ++l) {
        ru = _mm512_set1_ps(v[j]);
        iu = _mm512_set1_ps(v[j + 1]);
        rn = _mm512_mul_ps(rs[0], ru);
        in = _mm512_mul_ps(rs[0], iu);
        rn = _mm512_fnmadd_ps(is[0], iu, rn);
        in = _mm512_fmadd_ps(is[0], ru, in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          ru = _mm512_set1_ps(v[j]);
          iu = _mm512_set1_ps(v[j + 1]);
          rn = _mm512_fmadd_ps(rs[n], ru, rn);
          in = _mm512_fmadd_ps(rs[n], iu, in);
          rn = _mm512_fnmadd_ps(is[n], iu, rn);
          in = _mm512_fmadd_ps(is[n], ru, in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 8 + cqs.size();
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    // Matrix is passed directly; no pre-scatter is needed in this variant.
    for_.Run(size * size2, f, matrix, ms, xss,
             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate with all gate qubits "high" (q > 3) and
  // at least one "low" control qubit (q <= 3).  Low controls are baked into a
  // pre-scattered coefficient table: lanes failing the low-control match get
  // an identity row so their amplitudes pass through unchanged.
  //   qs     - gate qubit indices, ascending (all > 3 in this variant).
  //   cqs    - control qubit indices (mixed high/low).
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHHH_L(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[4];
    uint64_t ms[5];

    xs[0] = uint64_t{1} << (qs[0] + 1);
    ms[0] = (uint64_t{1} << qs[0]) - 1;
    for (unsigned i = 1; i < 4; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);

    uint64_t xss[16];
    for (unsigned i = 0; i < 16; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 4; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    unsigned cl = 0;
    uint64_t emaskl = 0;
    uint64_t emaskh = 0;

    // Split controls into high (outer-index) and low (in-register) sets.
    for (auto q : cqs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      } else {
        ++cl;
        emaskl |= uint64_t{1} << q;
      }
    }

    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);

    for (auto q : qs) {
      emaskh |= uint64_t{1} << q;
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];

    auto s = UnitarySpace::Create(6);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    // NOTE(review): qs[0] > 3 in this variant, so this qmask has no bits in
    // the low 4 lane positions and CompressBits(j, 4, qmask) below is always
    // 0 — the scatter is effectively uniform across lanes.  Presumably kept
    // for symmetry with the generated sibling variants; confirm upstream.
    unsigned qmask = (1 << qs[0]);

    for (unsigned i = 0; i < 16; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (16 * i + 16 * k + m);
        }

        unsigned l = 2 * (16 * i + m);

        // Identity fallback on the 16x16 diagonal for non-matching lanes.
        for (unsigned j = 0; j < 16; ++j) {
          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                uint64_t size, uint64_t row_size, fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 16; ++l) {
        rs[l] = _mm512_load_ps(p0 + xss[l]);
        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 16; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    // Low controls do not reduce the iterated index space, hence "- cl".
    unsigned k = 8 + cqs.size() - cl;
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate with three "high" gate qubits
  // (qs[1..3] > 3) and one "low" gate qubit (qs[0] <= 3); all control qubits
  // are high.  Eight memory blocks (xss) each contribute two in-register
  // sub-amplitudes, obtained via one lane permutation per block.
  //   qs     - gate qubit indices, ascending (qs[0] is the low one).
  //   cqs    - control qubit indices.
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHHL_H(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[3];
    uint64_t ms[4];

    // Strides/masks are built from the three high gate qubits qs[1..3].
    xs[0] = uint64_t{1} << (qs[1] + 1);
    ms[0] = (uint64_t{1} << qs[1]) - 1;
    for (unsigned i = 1; i < 3; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);

    uint64_t xss[8];
    for (unsigned i = 0; i < 8; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 3; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    uint64_t emaskh = 0;

    for (auto q : cqs) {
      emaskh |= uint64_t{1} << q;
    }

    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[1];

    auto s = UnitarySpace::Create(6);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]);

    // One permutation: the flip of the single low gate qubit within a lane.
    for (unsigned i = 0; i < 1; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    // Scatter the 16x16 matrix into lane-shaped coefficient vectors.
    for (unsigned i = 0; i < 8; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
        }

        unsigned l = 2 * (16 * i + m);

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j] = matrix[p[j]];
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = matrix[p[j] + 1];
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      // Each of the 8 memory blocks yields its loaded vector plus one
      // permuted copy (the low-qubit flip), giving all 16 sub-amplitudes.
      for (unsigned l = 0; l < 8; ++l) {
        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 2; ++j) {
          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 8; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 7 + cqs.size();
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Same as ApplyControlledGate4HHHL_H (three high gate qubits, one low),
  // but with at least one "low" control qubit (q <= 3).  Low controls are
  // folded into the pre-scattered coefficient table with an identity-row
  // fallback for non-matching lanes.
  //   qs     - gate qubit indices, ascending (qs[0] is the low one).
  //   cqs    - control qubit indices (mixed high/low).
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHHL_L(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[3];
    uint64_t ms[4];

    xs[0] = uint64_t{1} << (qs[1] + 1);
    ms[0] = (uint64_t{1} << qs[1]) - 1;
    for (unsigned i = 1; i < 3; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);

    uint64_t xss[8];
    for (unsigned i = 0; i < 8; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 3; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    unsigned cl = 0;
    uint64_t emaskl = 0;
    uint64_t emaskh = 0;

    // Split controls into high (outer-index) and low (in-register) sets.
    for (auto q : cqs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      } else {
        ++cl;
        emaskl |= uint64_t{1} << q;
      }
    }

    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[1];

    auto s = UnitarySpace::Create(6);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]);

    // Single lane permutation: flip of the low gate qubit.
    for (unsigned i = 0; i < 1; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 8; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
        }

        unsigned l = 2 * (16 * i + m);

        // Identity fallback on the 16x16 diagonal for non-matching lanes.
        for (unsigned j = 0; j < 16; ++j) {
          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 8; ++l) {
        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 2; ++j) {
          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 8; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 7 + cqs.size() - cl;
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate with two "high" gate qubits (qs[2..3])
  // and two "low" gate qubits (qs[0..1] <= 3); all control qubits are high.
  // Four memory blocks each contribute four in-register sub-amplitudes via
  // three lane permutations.
  //   qs     - gate qubit indices, ascending.
  //   cqs    - control qubit indices.
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHLL_H(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[2];
    uint64_t ms[3];

    // Strides/masks from the two high gate qubits qs[2..3].
    xs[0] = uint64_t{1} << (qs[2] + 1);
    ms[0] = (uint64_t{1} << qs[2]) - 1;
    for (unsigned i = 1; i < 2; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);

    uint64_t xss[4];
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 2; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    uint64_t emaskh = 0;

    for (auto q : cqs) {
      emaskh |= uint64_t{1} << q;
    }

    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[3];

    auto s = UnitarySpace::Create(5);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);

    // Three permutations: the non-identity shifts over the two low qubits.
    for (unsigned i = 0; i < 3; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 4; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
        }

        unsigned l = 2 * (16 * i + m);

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j] = matrix[p[j]];
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = matrix[p[j] + 1];
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      // 4 memory blocks x 4 lane variants = 16 sub-amplitudes.
      for (unsigned l = 0; l < 4; ++l) {
        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 4; ++j) {
          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 4; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 6 + cqs.size();
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Same as ApplyControlledGate4HHLL_H (two high, two low gate qubits), but
  // with at least one "low" control qubit (q <= 3).  Low controls are folded
  // into the coefficient table with an identity-row fallback.
  //   qs     - gate qubit indices, ascending.
  //   cqs    - control qubit indices (mixed high/low).
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HHLL_L(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[2];
    uint64_t ms[3];

    xs[0] = uint64_t{1} << (qs[2] + 1);
    ms[0] = (uint64_t{1} << qs[2]) - 1;
    for (unsigned i = 1; i < 2; ++i) {
      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
    }
    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);

    uint64_t xss[4];
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 2; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    unsigned cl = 0;
    uint64_t emaskl = 0;
    uint64_t emaskh = 0;

    // Split controls into high (outer-index) and low (in-register) sets.
    for (auto q : cqs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      } else {
        ++cl;
        emaskl |= uint64_t{1} << q;
      }
    }

    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[3];

    auto s = UnitarySpace::Create(5);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);

    for (unsigned i = 0; i < 3; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 4; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
        }

        unsigned l = 2 * (16 * i + m);

        // Identity fallback on the 16x16 diagonal for non-matching lanes.
        for (unsigned j = 0; j < 16; ++j) {
          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 4; ++l) {
        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 4; ++j) {
          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 4; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 6 + cqs.size() - cl;
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate with one "high" gate qubit (qs[3] > 3)
  // and three "low" gate qubits (qs[0..2] <= 3); all control qubits are high.
  // Two memory blocks each contribute eight in-register sub-amplitudes via
  // seven lane permutations.
  //   qs     - gate qubit indices, ascending.
  //   cqs    - control qubit indices.
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HLLL_H(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[1];
    uint64_t ms[2];

    // Stride/masks from the single high gate qubit qs[3].
    xs[0] = uint64_t{1} << (qs[3] + 1);
    ms[0] = (uint64_t{1} << qs[3]) - 1;
    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);

    uint64_t xss[2];
    for (unsigned i = 0; i < 2; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 1; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    uint64_t emaskh = 0;

    for (auto q : cqs) {
      emaskh |= uint64_t{1} << q;
    }

    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[7];

    auto s = UnitarySpace::Create(5);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);

    // Seven permutations: non-identity shifts over the three low qubits.
    for (unsigned i = 0; i < 7; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 2; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
        }

        unsigned l = 2 * (16 * i + m);

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j] = matrix[p[j]];
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = matrix[p[j] + 1];
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      // 2 memory blocks x 8 lane variants = 16 sub-amplitudes.
      for (unsigned l = 0; l < 2; ++l) {
        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 8; ++j) {
          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 2; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 5 + cqs.size();
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Same as ApplyControlledGate4HLLL_H (one high, three low gate qubits),
  // but with at least one "low" control qubit (q <= 3), folded into the
  // coefficient table with an identity-row fallback.
  //   qs     - gate qubit indices, ascending.
  //   cqs    - control qubit indices (mixed high/low).
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4HLLL_L(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t xs[1];
    uint64_t ms[2];

    xs[0] = uint64_t{1} << (qs[3] + 1);
    ms[0] = (uint64_t{1} << qs[3]) - 1;
    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);

    uint64_t xss[2];
    for (unsigned i = 0; i < 2; ++i) {
      uint64_t a = 0;
      for (uint64_t k = 0; k < 1; ++k) {
        if (((i >> k) & 1) == 1) {
          a += xs[k];
        }
      }
      xss[i] = a;
    }

    unsigned cl = 0;
    uint64_t emaskl = 0;
    uint64_t emaskh = 0;

    // Split controls into high (outer-index) and low (in-register) sets.
    for (auto q : cqs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      } else {
        ++cl;
        emaskl |= uint64_t{1} << q;
      }
    }

    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[7];

    auto s = UnitarySpace::Create(5);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);

    for (unsigned i = 0; i < 7; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 2; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
        }

        unsigned l = 2 * (16 * i + m);

        // Identity fallback on the 16x16 diagonal for non-matching lanes.
        for (unsigned j = 0; j < 16; ++j) {
          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                const uint64_t* ms, const uint64_t* xss,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      for (unsigned l = 0; l < 2; ++l) {
        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);

        for (unsigned j = 1; j < 8; ++j) {
          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 2; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0 + xss[l], rn);
        _mm512_store_ps(p0 + xss[l] + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 5 + cqs.size() - cl;
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w, ms, xss,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
  // Applies a controlled 4-qubit gate where all four gate qubits are "low"
  // (q <= 3, i.e. the whole gate acts inside one 16-float register) and all
  // control qubits are high.  A single load plus 15 lane permutations
  // produces every sub-amplitude; no xss offsets are needed.
  //   qs     - gate qubit indices.
  //   cqs    - control qubit indices.
  //   cmask  - control on/off pattern, one bit per entry of cqs.
  //   matrix - gate matrix as interleaved (re, im) pairs.
  //   state  - unitary matrix, updated in place.
  void ApplyControlledGate4LLLL_H(const std::vector<unsigned>& qs,
                                  const std::vector<unsigned>& cqs,
                                  uint64_t cmask, const fp_type* matrix,
                                  Unitary& state) const {
    uint64_t emaskh = 0;

    for (auto q : cqs) {
      emaskh |= uint64_t{1} << q;
    }

    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);

    for (auto q : qs) {
      if (q > 3) {
        emaskh |= uint64_t{1} << q;
      }
    }

    // Complement of fixed bits; low 4 lane bits excluded.
    emaskh = ~emaskh ^ 15;

    unsigned p[16];
    __m512i idx[15];

    auto s = UnitarySpace::Create(4);
    __m512* w = (__m512*) s.get();
    fp_type* wf = (fp_type*) w;

    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);

    // Fifteen permutations: all non-identity shifts of the 4-bit sub-index.
    for (unsigned i = 0; i < 15; ++i) {
      for (unsigned j = 0; j < 16; ++j) {
        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
      }

      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
                                p[9], p[8], p[7], p[6], p[5], p[4],
                                p[3], p[2], p[1], p[0]);
    }

    for (unsigned i = 0; i < 1; ++i) {
      for (unsigned m = 0; m < 16; ++m) {
        for (unsigned j = 0; j < 16; ++j) {
          unsigned k = bits::CompressBits(j, 4, qmask);
          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);
        }

        unsigned l = 2 * (16 * i + m);

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j] = matrix[p[j]];
        }

        for (unsigned j = 0; j < 16; ++j) {
          wf[16 * l + j + 16] = matrix[p[j] + 1];
        }
      }
    }

    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
                const __m512i* idx, uint64_t size, uint64_t row_size,
                fp_type* rstate) {
      __m512 rn, in;
      __m512 rs[16], is[16];

      uint64_t ii = i % size;
      uint64_t r = i / size;
      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;

      auto p0 = rstate + row_size * r + 2 * c;

      // One load per work item; 15 permuted copies cover the other indices.
      for (unsigned l = 0; l < 1; ++l) {
        rs[16 * l] = _mm512_load_ps(p0);
        is[16 * l] = _mm512_load_ps(p0 + 16);

        for (unsigned j = 1; j < 16; ++j) {
          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
        }
      }

      uint64_t j = 0;

      for (unsigned l = 0; l < 1; ++l) {
        rn = _mm512_mul_ps(rs[0], w[j]);
        in = _mm512_mul_ps(rs[0], w[j + 1]);
        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
        in = _mm512_fmadd_ps(is[0], w[j], in);

        j += 2;

        for (unsigned n = 1; n < 16; ++n) {
          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
          in = _mm512_fmadd_ps(is[n], w[j], in);

          j += 2;
        }

        _mm512_store_ps(p0, rn);
        _mm512_store_ps(p0 + 16, in);
      }
    };

    fp_type* rstate = state.get();

    unsigned k = 4 + cqs.size();
    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
    uint64_t size = uint64_t{1} << n;
    uint64_t size2 = uint64_t{1} << state.num_qubits();
    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());

    for_.Run(size * size2, f, w,
             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
  }
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - void ApplyControlledGate4LLLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); - - for (auto q : qs) { - if (q > 3) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 15; - - unsigned p[16]; - __m512i idx[15]; - - auto s = UnitarySpace::Create(4); - __m512* w = (__m512*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); - - for (unsigned i = 0; i < 15; ++i) { - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 16; ++j) { - unsigned k = bits::CompressBits(j, 4, qmask); - p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 16; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 16; ++j) { - wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m512 rn, in; - __m512 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[16 * l] = _mm512_load_ps(p0); - is[16 * l] = _mm512_load_ps(p0 + 16); - - for (unsigned j = 1; j < 16; ++j) { - rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); - is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); - } - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm512_fmadd_ps(rs[n], w[j], rn); - in = _mm512_fmadd_ps(rs[n], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); - in = _mm512_fmadd_ps(is[n], w[j], in); - - j += 2; - } - - _mm512_store_ps(p0, rn); - _mm512_store_ps(p0 + 16, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 4, mask); - return bits::ExpandBits((c + b) % lsize, 4, mask); } For for_; diff --git a/lib/unitary_calculator_basic.h b/lib/unitary_calculator_basic.h index df5a3067..6b1821a9 100644 --- a/lib/unitary_calculator_basic.h +++ b/lib/unitary_calculator_basic.h @@ -15,12 +15,12 @@ #ifndef UNITARY_CALCULATOR_BASIC_H_ #define UNITARY_CALCULATOR_BASIC_H_ - -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "unitaryspace_basic.h" namespace qsim { @@ -30,7 +30,7 @@ namespace unitary { * Quantum circuit unitary calculator without vectorization. */ template -class UnitaryCalculatorBasic final { +class UnitaryCalculatorBasic final : public SimulatorBase { public: using UnitarySpace = UnitarySpaceBasic; using Unitary = typename UnitarySpace::Unitary; @@ -49,27 +49,27 @@ class UnitaryCalculatorBasic final { * @param state The state of the system, to be updated by this method. */ void ApplyGate(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { + const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . 
switch (qs.size()) { case 1: - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); break; case 2: - ApplyGate2H(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); break; case 3: - ApplyGate3H(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); break; case 4: - ApplyGate4H(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); break; case 5: - ApplyGate5H(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); break; case 6: - ApplyGate6H(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); break; default: // Not implemented. @@ -81,13 +81,15 @@ class UnitaryCalculatorBasic final { * Applies a controlled gate using non-vectorized instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. * @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, Unitary& state) const { + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -95,16 +97,16 @@ class UnitaryCalculatorBasic final { switch (qs.size()) { case 1: - ApplyControlledGate1H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); break; case 2: - ApplyControlledGate2H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); break; case 3: - ApplyControlledGate3H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); break; case 4: - ApplyControlledGate4H(qs, cqs, cmask, matrix, state); + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); break; default: // Not implemented. 
@@ -120,793 +122,132 @@ class UnitaryCalculatorBasic final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 1; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate2H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { fp_type rn, in; - fp_type rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]) | (4 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } + fp_type rs[hsize], is[hsize]; - void ApplyGate3H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + uint64_t r = i % size; + uint64_t s = i / size; - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]) | (4 * ii & ms[2]) - | (8 * ii & ms[3]); - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); } uint64_t j = 0; - for (unsigned l = 0; l < 8; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = rs[0] * v[j] - is[0] * v[j + 1]; in = rs[0] * v[j + 1] + is[0] * v[j]; j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned l = 1; l < hsize; ++l) 
{ + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; j += 2; } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; } }; - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate4H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]) | (4 * ii & ms[2]) - | (8 * ii & ms[3]) | (16 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - 
*(p0 + xss[l] + 1) = in; - } - }; + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - fp_type* rstate = state.get(); + FillIndices(state.num_qubits(), qs, ms, xss); - unsigned k = 4; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); } - void ApplyGate5H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]) | (4 * ii & ms[2]) - | (8 * ii & ms[3]) | (16 * ii & ms[4]) | (32 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - for (unsigned l = 
0; l < 32; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate6H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { fp_type rn, in; - fp_type rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (1 * ii & ms[0]) | (2 * ii & ms[1]) | (4 * ii & ms[2]) - | (8 * ii & ms[3]) | (16 * ii & ms[4]) | (32 * ii & ms[5]) - | (64 * ii & ms[6]); + fp_type rs[hsize], is[hsize]; - auto p0 = rstate + row_size * r + 2 * c; + uint64_t r = i % size; + uint64_t s = i / size; - for (unsigned l = 0; l < 64; ++l) { - rs[l] = *(p0 + xss[l]); - 
is[l] = *(p0 + xss[l] + 1); + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + if ((t & cmaskh) == cvalsh) { + auto p0 = rstate + row_size * s + 2 * t; - j += 2; + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); } - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyControlledGate1H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type 
rs[2], is[2]; + uint64_t j = 0; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 1 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : 
qs) { - emaskh |= uint64_t{1} << q; - } - emaskh = ~emaskh; + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; + j += 2; + } - for (unsigned l = 0; l < 4; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - 
j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - fp_type rn, in; - fp_type rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = *(p0 + xss[l]); - is[l] = *(p0 + xss[l] + 1); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = rs[0] * v[j] - is[0] * 
v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn += rs[n] * v[j] - is[n] * v[j + 1]; - in += rs[n] * v[j + 1] + is[n] * v[j]; - - j += 2; - } - - *(p0 + xss[l]) = rn; - *(p0 + xss[l] + 1) = in; - } - }; + FillIndices(state.num_qubits(), qs, ms, xss); - fp_type* rstate = state.get(); + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); } For for_; diff --git a/lib/unitary_calculator_sse.h b/lib/unitary_calculator_sse.h index 2e458526..a3c3f2eb 100644 --- a/lib/unitary_calculator_sse.h +++ b/lib/unitary_calculator_sse.h @@ -17,11 +17,12 @@ #include -#include #include #include +#include +#include -#include "bits.h" +#include "simulator.h" #include "unitaryspace_sse.h" namespace qsim { @@ -31,7 +32,7 @@ namespace unitary { * Quantum circuit unitary calculator with SSE vectorization. */ template -class UnitaryCalculatorSSE final { +class UnitaryCalculatorSSE final : public SimulatorBase { public: using UnitarySpace = UnitarySpaceSSE; using Unitary = typename UnitarySpace::Unitary; @@ -50,60 +51,60 @@ class UnitaryCalculatorSSE final { * @param state The state of the system, to be updated by this method. */ void ApplyGate(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { + const fp_type* matrix, State& state) const { // Assume qs[0] < qs[1] < qs[2] < ... . 
switch (qs.size()) { case 1: if (qs[0] > 1) { - ApplyGate1H(qs, matrix, state); + ApplyGateH<1>(qs, matrix, state); } else { - ApplyGate1L(qs, matrix, state); + ApplyGateL<0, 1>(qs, matrix, state); } break; case 2: if (qs[0] > 1) { - ApplyGate2HH(qs, matrix, state); + ApplyGateH<2>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate2HL(qs, matrix, state); + ApplyGateL<1, 1>(qs, matrix, state); } else { - ApplyGate2LL(qs, matrix, state); + ApplyGateL<0, 2>(qs, matrix, state); } break; case 3: if (qs[0] > 1) { - ApplyGate3HHH(qs, matrix, state); + ApplyGateH<3>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate3HHL(qs, matrix, state); + ApplyGateL<2, 1>(qs, matrix, state); } else { - ApplyGate3HLL(qs, matrix, state); + ApplyGateL<1, 2>(qs, matrix, state); } break; case 4: if (qs[0] > 1) { - ApplyGate4HHHH(qs, matrix, state); + ApplyGateH<4>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate4HHHL(qs, matrix, state); + ApplyGateL<3, 1>(qs, matrix, state); } else { - ApplyGate4HHLL(qs, matrix, state); + ApplyGateL<2, 2>(qs, matrix, state); } break; case 5: if (qs[0] > 1) { - ApplyGate5HHHHH(qs, matrix, state); + ApplyGateH<5>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate5HHHHL(qs, matrix, state); + ApplyGateL<4, 1>(qs, matrix, state); } else { - ApplyGate5HHHLL(qs, matrix, state); + ApplyGateL<3, 2>(qs, matrix, state); } break; case 6: if (qs[0] > 1) { - ApplyGate6HHHHHH(qs, matrix, state); + ApplyGateH<6>(qs, matrix, state); } else if (qs[1] > 1) { - ApplyGate6HHHHHL(qs, matrix, state); + ApplyGateL<5, 1>(qs, matrix, state); } else { - ApplyGate6HHHHLL(qs, matrix, state); + ApplyGateL<4, 2>(qs, matrix, state); } break; default: @@ -116,13 +117,16 @@ class UnitaryCalculatorSSE final { * Applies a controlled gate using SSE instructions. * @param qs Indices of the qubits affected by this gate. * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. + * @param cvals Bit mask of control qubit values. 
* @param matrix Matrix representation of the gate to be applied. * @param state The state of the system, to be updated by this method. */ void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, Unitary& state) const { + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + if (cqs.size() == 0) { ApplyGate(qs, matrix, state); return; @@ -132,78 +136,78 @@ class UnitaryCalculatorSSE final { case 1: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); } } break; case 2: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<0, 2, 0>(qs, cqs, 
cvals, matrix, state); } } break; case 3: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); } } break; case 4: if (qs[0] > 1) { if (cqs[0] > 1) { - ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); } } else if (qs[1] > 1) { if (cqs[0] > 1) { - ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); } } else { if (cqs[0] > 1) { - ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); } else { - ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); } } break; @@ -221,46 +225,36 @@ class UnitaryCalculatorSSE final { } private: - void ApplyGate1H(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - 
uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + __m128 ru, iu, rn, in; - __m128 rs[2], is[2]; + __m128 rs[hsize], is[hsize]; - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]); + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); rn = _mm_mul_ps(rs[0], ru); @@ -270,85 +264,84 @@ class UnitaryCalculatorSSE final { j += 2; - for (unsigned n = 1; n < 2; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + 
in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 3; + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); } - void ApplyGate1L(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, unsigned q0, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - unsigned l = 2 * (2 * i + m); + __m128 rn, in; + __m128 rs[gsize], is[gsize]; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + uint64_t r = 4 * (i % size); + uint64_t s = i / size; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } + uint64_t t = r & 
ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - } - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; + auto p0 = rstate + row_size * s + 2 * t; - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 8 * ii; + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -356,75 +349,72 @@ class UnitaryCalculatorSSE final { j += 2; - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); - unsigned k = 2; + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, w, qs[0], size, raw_size, rstate); + for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, state.get()); } - void ApplyGate2HH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } + uint64_t r = 4 * (i % size); + uint64_t s = i / size; - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[4], is[4]; + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]); + if ((t & cmaskh) != cvalsh) return; - auto p0 = rstate + row_size * r + 2 * c; + auto p0 = rstate + row_size * s + 2 * t; - for (unsigned l = 
0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 4; ++l) { + for (unsigned k = 0; k < hsize; ++k) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); rn = _mm_mul_ps(rs[0], ru); @@ -434,191 +424,72 @@ class UnitaryCalculatorSSE final { j += 2; - for (unsigned n = 1; n < 4; ++n) { + for (unsigned l = 1; l < hsize; ++l) { ru = _mm_set1_ps(v[j]); iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; - unsigned k = 4; + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); } - void ApplyGate2HL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, uint64_t size, uint64_t row_size, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]); - - auto p0 = rstate + 
row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, rstate); - } - - void ApplyGate2LL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + constexpr unsigned hsize = 1 << H; - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); + __m128 rn, in; + __m128 rs[hsize], is[hsize]; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + uint64_t r = 4 * (i % size); + uint64_t s = i / size; - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - uint64_t ii = i % size; - uint64_t r = i / size; - auto p0 = rstate + row_size * r + 8 * ii; + if ((t & cmaskh) != cvalsh) return; - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); + auto p0 = rstate + row_size * s + 2 * t; - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); + for (unsigned k = 
0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); } uint64_t j = 0; - for (unsigned l = 0; l < 1; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -626,295 +497,90 @@ class UnitaryCalculatorSSE final { j += 2; - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); - - unsigned k = 2; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, size, raw_size, rstate); - } - - void ApplyGate3HHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - 
_mm_store_ps(p0 + xss[l] + 4, in); - } - }; + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H)]; - fp_type* rstate = state.get(); + auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - unsigned k = 5; + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); } - void ApplyGate3HHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - + template + void ApplyControlledGateL(const std::vector& qs, + 
const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, uint64_t size, uint64_t row_size, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, rstate); - } + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; - void ApplyGate3HLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; + __m128 rn, in; + __m128 rs[gsize], is[gsize]; - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + uint64_t r = 4 * (i % size); + uint64_t s = i / size; - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; } - xss[i] = a; - } - unsigned p[4]; + if ((t & cmaskh) != cvalsh) return; - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; + auto p0 = rstate + row_size * s + 2 * t; - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); } } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } uint64_t j = 0; - for (unsigned l = 0; l < 2; ++l) { + for (unsigned k = 0; k < hsize; ++k) { rn = _mm_mul_ps(rs[0], w[j]); in = _mm_mul_ps(rs[0], w[j + 1]); rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); @@ -922,3612 +588,46 @@ class UnitaryCalculatorSSE final { j += 2; - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, 
_mm_mul_ps(is[l], w[j])); j += 2; } - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); } }; - fp_type* rstate = state.get(); - - unsigned k = 3; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, size, raw_size, rstate); - } - - void ApplyGate4HHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]) | (64 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 16; 
++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; - fp_type* rstate = state.get(); + FillIndices(state.num_qubits(), qs, ms, xss); - unsigned k = 6; + unsigned k = 2 + H; unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; uint64_t size = uint64_t{1} << n; uint64_t size2 = uint64_t{1} << state.num_qubits(); uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); - void ApplyGate4HHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* 
wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, rstate); - } - - void ApplyGate4HHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = 
_mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, size, raw_size, rstate); - } - - void ApplyGate5HHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]) | (64 * ii & ms[4]) | (128 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - 
j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate5HHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * 
ii & ms[3]) | (64 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, rstate); - } - - void ApplyGate5HHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 32; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (32 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[32], is[32]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 
* l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 32; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, size, raw_size, rstate); - } - - void ApplyGate6HHHHHH(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[6]; - uint64_t ms[7]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 6; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); - - uint64_t xss[64]; - for (unsigned i = 0; i < 64; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 6; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]) | (64 * ii & ms[4]) | (128 * ii & ms[5]) - | (256 * ii & ms[6]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 64; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 64; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, 
_mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 8; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); - } - - void ApplyGate6HHHHHL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[5]; - uint64_t ms[6]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 5; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); - - uint64_t xss[32]; - for (unsigned i = 0; i < 32; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 5; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(7); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 32; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | 
(16 * ii & ms[2]) - | (32 * ii & ms[3]) | (64 * ii & ms[4]) | (128 * ii & ms[5]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 32; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 32; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 7; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, rstate); - } - - void ApplyGate6HHHHLL(const std::vector& qs, - const fp_type* matrix, Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned p[4]; - - auto s = UnitarySpace::Create(6); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 64; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (64 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[64], is[64]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = (4 * ii & ms[0]) | (8 * ii & ms[1]) | (16 * ii & ms[2]) - | (32 * ii & ms[3]) | (64 * ii & ms[4]); - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = 
_mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 64; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, size, raw_size, rstate); - } - - void ApplyControlledGate1H_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = 
_mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1H_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (2 * i + 2 * k + m); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 
4; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate1L_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate1L_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 2; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 2 * k + 2 * 
(m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (2 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[2], is[2]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[2 * l] = _mm_load_ps(p0); - is[2 * l] = _mm_load_ps(p0 + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 2; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate2HH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, 
_mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) 
s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (4 * i + 4 * k + m); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2HL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = 
bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate2HL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate2LL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = 
_mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate2LL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(2); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 1; ++i) { - for (unsigned m = 0; m < 4; ++m) { - for (unsigned j = 0; j < 4; ++j) 
{ - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (4 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[4], is[4]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 1; ++l) { - rs[4 * l] = _mm_load_ps(p0); - is[4 * l] = _mm_load_ps(p0 + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 1; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 4; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0, rn); - _mm_store_ps(p0 + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 2 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, 
_mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) 
s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (8 * i + 8 * k + m); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, 
uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate3HHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t 
c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate3HLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[1]; - uint64_t ms[2]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); - - uint64_t xss[2]; - for (unsigned i = 0; i < 2; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 1; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(3); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 2; ++i) { - for (unsigned m = 0; m < 8; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (8 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[8], is[8]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 2; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 2; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 8; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 3 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHH_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 ru, iu, rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = 
_mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[n], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHH_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[4]; - uint64_t ms[5]; - - xs[0] = uint64_t{1} << (qs[0] + 1); - ms[0] = (uint64_t{1} << qs[0]) - 1; - for (unsigned i = 1; i < 4; ++i) { - xs[i] = uint64_t{1} << (qs[i + 0] + 1); - ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); - } - ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); - - uint64_t xss[16]; - for (unsigned i = 0; i < 16; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 4; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - emaskh |= uint64_t{1} << q; - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(5); - __m128* w 
= (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 16; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (16 * i + 16 * k + m); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 16; ++l) { - rs[l] = _mm_load_ps(p0 + xss[l]); - is[l] = _mm_load_ps(p0 + xss[l] + 4); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 16; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 6 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHHL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, 
uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate4HHHL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[3]; - uint64_t ms[4]; - - xs[0] = uint64_t{1} << (qs[1] + 1); - ms[0] = (uint64_t{1} << qs[1]) - 1; - for (unsigned i = 1; i < 3; ++i) { - xs[i] = uint64_t{1} << (qs[i + 1] + 1); - ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); - } - ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); - - uint64_t xss[8]; - for (unsigned i = 0; i < 8; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 3; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(5); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]); - - for (unsigned i = 0; i < 8; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 8; ++l) { - rs[2 * l] = _mm_load_ps(p0 + xss[l]); - is[2 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(rs[2 * l], rs[2 * l], 177) - : _mm_shuffle_ps(rs[2 * l], rs[2 * l], 78); - is[2 * l + 1] = q0 == 0 ? _mm_shuffle_ps(is[2 * l], is[2 * l], 177) - : _mm_shuffle_ps(is[2 * l], is[2 * l], 78); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 8; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 5 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, qs[0], size, raw_size, rstate); - } - - void ApplyControlledGate4HHLL_H(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - uint64_t emaskh = 0; - - for (auto q : cqs) { - emaskh |= uint64_t{1} << q; - } - - uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j] = matrix[p[j]]; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = matrix[p[j] + 1]; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - 
uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - void ApplyControlledGate4HHLL_L(const std::vector& qs, - const std::vector& cqs, - uint64_t cmask, const fp_type* matrix, - Unitary& state) const { - uint64_t xs[2]; - uint64_t ms[3]; - - xs[0] = uint64_t{1} << (qs[2] + 1); - ms[0] = (uint64_t{1} << qs[2]) - 1; - for (unsigned i = 1; i < 2; ++i) { - xs[i] = uint64_t{1} << (qs[i + 2] + 1); - ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); - } - ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); - - uint64_t xss[4]; - for (unsigned i = 0; i < 4; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < 2; ++k) { - if (((i >> k) & 1) == 1) { - a += xs[k]; - } - } - xss[i] = a; - } - - unsigned cl = 0; - uint64_t emaskl = 0; - uint64_t emaskh = 0; - - for (auto q : cqs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } else { - ++cl; - emaskl |= uint64_t{1} << q; - } - } - - uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); - uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 2, emaskl); - - for (auto q : qs) { - if (q > 1) { - emaskh |= uint64_t{1} << q; - } - } - - emaskh = ~emaskh ^ 3; - - unsigned p[4]; - - auto s = UnitarySpace::Create(4); - __m128* w = (__m128*) s.get(); - fp_type* wf = (fp_type*) w; - - unsigned qmask = (1 << qs[0]) | (1 << qs[1]); - - for (unsigned i = 0; i < 4; ++i) { - for (unsigned m = 0; m < 16; ++m) { - for (unsigned j = 0; j < 4; ++j) { - unsigned k = bits::CompressBits(j, 2, qmask); - p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); - } - - unsigned l = 2 * (16 * i + m); - - for (unsigned j = 0; j < 4; ++j) { - fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; - wf[4 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; - } - - for (unsigned j = 0; j < 4; ++j) { - wf[4 * l + j + 4] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; - } - } - } - - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - __m128 rn, in; - __m128 rs[16], is[16]; - - uint64_t ii = i % size; - uint64_t r = i / size; - uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; - - auto p0 = rstate + row_size * r + 2 * c; - - for (unsigned l = 0; l < 4; ++l) { - rs[4 * l] = _mm_load_ps(p0 + xss[l]); - is[4 * l] = _mm_load_ps(p0 + xss[l] + 4); - - rs[4 * l + 1] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 57); - is[4 * l + 1] = _mm_shuffle_ps(is[4 * l], is[4 * l], 57); - rs[4 * l + 2] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 78); - is[4 * l + 2] = _mm_shuffle_ps(is[4 * l], is[4 * l], 78); - rs[4 * l + 3] = _mm_shuffle_ps(rs[4 * l], rs[4 * l], 147); - is[4 * l + 3] = _mm_shuffle_ps(is[4 * l], is[4 * l], 147); - } - - uint64_t j = 0; - - for (unsigned l = 0; l < 4; ++l) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned n = 1; n < 16; ++n) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[n], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[n], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[n], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[n], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[l], rn); - _mm_store_ps(p0 + xss[l] + 4, in); - } - }; - - fp_type* rstate = state.get(); - - unsigned k = 4 + cqs.size() - cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, - state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); - } - - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, 2, mask); - return bits::ExpandBits((c + b) % lsize, 2, mask); } For for_; diff --git a/lib/util_cpu.h b/lib/util_cpu.h new file mode 100644 index 00000000..8e024252 --- /dev/null +++ b/lib/util_cpu.h @@ -0,0 +1,43 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CPU_H_ +#define UTIL_CPU_H_ + +#ifdef __SSE2__ +# include +#endif + +namespace qsim { + +// This function sets flush-to-zero and denormals-are-zeros MXCSR control +// flags. This prevents rare cases of performance slowdown potentially at +// the cost of a tiny precision loss. +inline void SetFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() | 0x8040); +#endif +} + +// This function clears flush-to-zero and denormals-are-zeros MXCSR control +// flags. 
+inline void ClearFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040}); +#endif +} + +} // namespace qsim + +#endif // UTIL_CPU_H_ diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 43da065c..591d852d 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -34,15 +34,16 @@ inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { template struct Complex { - __device__ __forceinline__ Complex() {} + __host__ __device__ __forceinline__ Complex() {} - __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} + __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} - __device__ __forceinline__ Complex(const T& re, const T& im) + __host__ __device__ __forceinline__ Complex(const T& re, const T& im) : re(re), im(im) {} template - __device__ __forceinline__ Complex& operator=(const Complex& r) { + __host__ __device__ __forceinline__ Complex& operator=( + const Complex& r) { re = r.re; im = r.im; @@ -54,13 +55,13 @@ struct Complex { }; template -__device__ __forceinline__ Complex operator+( +__host__ __device__ __forceinline__ Complex operator+( const Complex& l, const Complex& r) { return Complex(l.re + r.re, l.im + r.im); } template -__device__ __forceinline__ Complex operator+( +__host__ __device__ __forceinline__ Complex operator+( const Complex& l, const Complex& r) { return Complex(l.re + r.re, l.im + r.im); } diff --git a/lib/util_custatevec.h b/lib/util_custatevec.h new file mode 100644 index 00000000..36f29efa --- /dev/null +++ b/lib/util_custatevec.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CUSTATEVEC_H_ +#define UTIL_CUSTATEVEC_H_ + +#include +#include + +#include "io.h" +#include "util_cuda.h" + +namespace qsim { + +inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { + if (code != CUBLAS_STATUS_SUCCESS) { + IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); + exit(code); + } +} + +inline void ErrorAssert( + custatevecStatus_t code, const char* file, unsigned line) { + if (code != CUSTATEVEC_STATUS_SUCCESS) { + IO::errorf("custatevec error: %s %s %d\n", + custatevecGetErrorString(code), file, line); + exit(code); + } +} + +} // namespace qsim + +#endif // UTIL_CUSTATEVEC_H_ diff --git a/lib/vectorspace.h b/lib/vectorspace.h index dc871d9f..5a5a6c94 100644 --- a/lib/vectorspace.h +++ b/lib/vectorspace.h @@ -74,6 +74,10 @@ class VectorSpace { return num_qubits_; } + bool requires_copy_to_host() const { + return false; + } + private: Pointer ptr_; unsigned num_qubits_; diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index c064a90b..ac228c63 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -67,6 +67,10 @@ class VectorSpaceCUDA { return num_qubits_; } + bool requires_copy_to_host() const { + return true; + } + private: Pointer ptr_; unsigned num_qubits_; diff --git a/pybind_interface/Dockerfile b/pybind_interface/Dockerfile index c74fbacc..b826d197 100644 --- a/pybind_interface/Dockerfile +++ b/pybind_interface/Dockerfile @@ -6,7 +6,7 @@ RUN apt-get install -y python3-dev python3-pybind11 python3-pytest python3-pip # The --force flag is used mainly so that 
the old numpy installation from pybind # gets replaced with the one cirq requires -RUN pip3 install cirq --force +RUN pip3 install --prefer-binary cirq-core --force # Copy relevant files COPY ./pybind_interface/ /qsim/pybind_interface/ diff --git a/pybind_interface/GetPybind11.cmake b/pybind_interface/GetPybind11.cmake new file mode 100644 index 00000000..f93c2bf6 --- /dev/null +++ b/pybind_interface/GetPybind11.cmake @@ -0,0 +1,14 @@ +include(FetchContent) + +set(MIN_PYBIND_VERSION "2.2.4") +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG "v${MIN_PYBIND_VERSION}" +) +FetchContent_GetProperties(pybind11) +find_package(pybind11 "${MIN_PYBIND_VERSION}" CONFIG) +if((NOT pybind11_FOUND) AND (NOT pybind11_POPULATED)) # check first on system path, then attempt git fetch + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile index 8e48ff57..8450bd17 100644 --- a/pybind_interface/Makefile +++ b/pybind_interface/Makefile @@ -3,27 +3,67 @@ QSIMLIB_BASIC = ../qsimcirq/qsim_basic`python3-config --extension-suffix` QSIMLIB_SSE = ../qsimcirq/qsim_sse`python3-config --extension-suffix` QSIMLIB_AVX2 = ../qsimcirq/qsim_avx2`python3-config --extension-suffix` QSIMLIB_AVX512 = ../qsimcirq/qsim_avx512`python3-config --extension-suffix` +QSIMLIB_CUDA = ../qsimcirq/qsim_cuda`python3-config --extension-suffix` +QSIMLIB_CUSTATEVEC = ../qsimcirq/qsim_custatevec`python3-config --extension-suffix` QSIMLIB_DECIDE = ../qsimcirq/qsim_decide`python3-config --extension-suffix` - # The flags for the compilation of the simd-specific Pybind11 interfaces PYBINDFLAGS_BASIC = -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes` PYBINDFLAGS_SSE = -msse4.1 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes` PYBINDFLAGS_AVX2 = -mavx2 -mfma -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes` 
-PYBINDFLAGS_AVX512 = -mavx512f -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes` +PYBINDFLAGS_AVX512 = -mavx512f -mbmi2 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes` + +# The flags for the compilation of GPU-specific Pybind11 interfaces +PYBINDFLAGS_CUDA = -std=c++14 -x cu -Xcompiler "-Wall -shared -fPIC `python3 -m pybind11 --includes`" + +# The flags for the compilation of cuStateVec-specific Pybind11 interfaces +PYBINDFLAGS_CUSTATEVEC = $(CUSTATEVECFLAGS) $(PYBINDFLAGS_CUDA) -.PHONY: pybind -pybind: +# Check for nvcc to decide compilation mode. +ifeq ($(shell which $(NVCC)),) +pybind: pybind-cpu decide-cpu +else +# Check for the cuStateVec library. +ifeq ($(CUQUANTUM_DIR),) +pybind: pybind-cpu pybind-gpu decide-gpu +else +pybind: pybind-cpu pybind-gpu pybind-custatevec decide-custatevec +endif +endif + +.PHONY: pybind-cpu +pybind-cpu: $(CXX) basic/pybind_main_basic.cpp -o $(QSIMLIB_BASIC) $(CXXFLAGS) $(PYBINDFLAGS_BASIC) $(CXX) sse/pybind_main_sse.cpp -o $(QSIMLIB_SSE) $(CXXFLAGS) $(PYBINDFLAGS_SSE) $(CXX) avx2/pybind_main_avx2.cpp -o $(QSIMLIB_AVX2) $(CXXFLAGS) $(PYBINDFLAGS_AVX2) $(CXX) avx512/pybind_main_avx512.cpp -o $(QSIMLIB_AVX512) $(CXXFLAGS) $(PYBINDFLAGS_AVX512) + +.PHONY: decide-cpu +decide-cpu: $(CXX) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(CXXFLAGS) $(PYBINDFLAGS_BASIC) +.PHONY: pybind-gpu +pybind-gpu: + $(NVCC) cuda/pybind_main_cuda.cpp -o $(QSIMLIB_CUDA) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA) + +.PHONY: decide-gpu +decide-gpu: + $(NVCC) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA) + +.PHONY: pybind-custatevec +pybind-custatevec: + $(NVCC) custatevec/pybind_main_custatevec.cpp -o $(QSIMLIB_CUSTATEVEC) $(NVCCFLAGS) $(PYBINDFLAGS_CUSTATEVEC) + +.PHONY: decide-custatevec +decide-custatevec: + $(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA) + .PHONY: clean clean: -rm -f ./basic/*.x ./basic/*.a ./basic/*.so ./basic/*.mod $(QSIMLIB_BASIC) -rm -f 
./sse/*.x ./sse/*.a ./sse/*.so ./sse/*.mod $(QSIMLIB_SSE) -rm -f ./avx2/*.x ./avx2/*.a ./avx2/*.so ./avx2/*.mod $(QSIMLIB_AVX2) -rm -f ./avx512/*.x ./avx512/*.a ./avx512/*.so ./avx512/*.mod $(QSIMLIB_AVX512) + -rm -f ./cuda/*.x ./cuda/*.a ./cuda/*.so ./cuda/*.mod $(QSIMLIB_CUDA) + -rm -f ./custatevec/*.x ./custatevec/*.a ./custatevec/*.so ./custatevec/*.mod $(QSIMLIB_CUSTATEVEC) -rm -f ./decide/*.x ./decide/*.a ./decide/*.so ./decide/*.mod $(QSIMLIB_DECIDE) diff --git a/pybind_interface/avx2/CMakeLists.txt b/pybind_interface/avx2/CMakeLists.txt index 4382de95..eebba584 100644 --- a/pybind_interface/avx2/CMakeLists.txt +++ b/pybind_interface/avx2/CMakeLists.txt @@ -12,17 +12,5 @@ if(APPLE) include_directories("/usr/local/include" "/usr/local/opt/llvm/include") link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") endif() - -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.2.4 -) -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() +INCLUDE(../GetPybind11.cmake) pybind11_add_module(qsim_avx2 pybind_main_avx2.cpp) diff --git a/pybind_interface/avx2/pybind_main_avx2.cpp b/pybind_interface/avx2/pybind_main_avx2.cpp index 29e5e81f..1f7dc084 100644 --- a/pybind_interface/avx2/pybind_main_avx2.cpp +++ b/pybind_interface/avx2/pybind_main_avx2.cpp @@ -14,11 +14,35 @@ #include "pybind_main_avx2.h" +#include "../../lib/formux.h" #include "../../lib/simulator_avx.h" +#include "../../lib/util_cpu.h" namespace qsim { template using Simulator = SimulatorAVX; + + struct Factory { + // num_state_threads and num_dblocks are unused, but kept for consistency + // with the GPU Factory. 
+ Factory( + unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) : num_threads(num_sim_threads) {} + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace(num_threads); + } + + Simulator CreateSimulator() const { + return Simulator(num_threads); + } + + unsigned num_threads; + }; } #include "../pybind_main.cpp" diff --git a/pybind_interface/avx512/CMakeLists.txt b/pybind_interface/avx512/CMakeLists.txt index c265fe13..86cfdfa8 100644 --- a/pybind_interface/avx512/CMakeLists.txt +++ b/pybind_interface/avx512/CMakeLists.txt @@ -5,7 +5,7 @@ project(qsim) IF (WIN32) set(CMAKE_CXX_FLAGS "/arch:AVX512 /O2 /openmp") ELSE() - set(CMAKE_CXX_FLAGS "-mavx512f -O3 -fopenmp") + set(CMAKE_CXX_FLAGS "-mavx512f -mbmi2 -O3 -fopenmp") ENDIF() if(APPLE) @@ -14,16 +14,5 @@ if(APPLE) link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") endif() -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.2.4 -) -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() +INCLUDE(../GetPybind11.cmake) pybind11_add_module(qsim_avx512 pybind_main_avx512.cpp) diff --git a/pybind_interface/avx512/pybind_main_avx512.cpp b/pybind_interface/avx512/pybind_main_avx512.cpp index 97c50117..b87ececb 100644 --- a/pybind_interface/avx512/pybind_main_avx512.cpp +++ b/pybind_interface/avx512/pybind_main_avx512.cpp @@ -14,11 +14,35 @@ #include "pybind_main_avx512.h" +#include "../../lib/formux.h" #include "../../lib/simulator_avx512.h" +#include "../../lib/util_cpu.h" namespace qsim { template using Simulator = SimulatorAVX512; + + struct Factory { + // num_state_threads and num_dblocks are unused, but kept for consistency + // with the GPU Factory. 
+ Factory( + unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) : num_threads(num_sim_threads) {} + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace(num_threads); + } + + Simulator CreateSimulator() const { + return Simulator(num_threads); + } + + unsigned num_threads; + }; } #include "../pybind_main.cpp" diff --git a/pybind_interface/basic/CMakeLists.txt b/pybind_interface/basic/CMakeLists.txt index 9380f02d..35347211 100644 --- a/pybind_interface/basic/CMakeLists.txt +++ b/pybind_interface/basic/CMakeLists.txt @@ -14,16 +14,5 @@ if(APPLE) link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") endif() -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.2.4 -) -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() +INCLUDE(../GetPybind11.cmake) pybind11_add_module(qsim_basic pybind_main_basic.cpp) diff --git a/pybind_interface/basic/pybind_main_basic.cpp b/pybind_interface/basic/pybind_main_basic.cpp index afeba392..51fdc608 100644 --- a/pybind_interface/basic/pybind_main_basic.cpp +++ b/pybind_interface/basic/pybind_main_basic.cpp @@ -14,11 +14,35 @@ #include "pybind_main_basic.h" +#include "../../lib/formux.h" #include "../../lib/simulator_basic.h" +#include "../../lib/util_cpu.h" namespace qsim { template using Simulator = SimulatorBasic; + + struct Factory { + // num_state_threads and num_dblocks are unused, but kept for consistency + // with the GPU Factory. 
+ Factory( + unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) : num_threads(num_sim_threads) {} + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace(num_threads); + } + + Simulator CreateSimulator() const { + return Simulator(num_threads); + } + + unsigned num_threads; + }; } #include "../pybind_main.cpp" diff --git a/pybind_interface/cuda/CMakeLists.txt b/pybind_interface/cuda/CMakeLists.txt new file mode 100644 index 00000000..7da45dac --- /dev/null +++ b/pybind_interface/cuda/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.11) +project(qsim LANGUAGES CXX CUDA) + +IF (WIN32) + set(CMAKE_CXX_FLAGS "/O2 /openmp") +ELSE() + set(CMAKE_CXX_FLAGS "-O3 -fopenmp") +ENDIF() + + +if(APPLE) + set(CMAKE_CXX_STANDARD 14) + include_directories("/usr/local/include" "/usr/local/opt/llvm/include") + link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") +endif() + +INCLUDE(../GetPybind11.cmake) +find_package(PythonLibs 3.6 REQUIRED) +find_package(CUDA REQUIRED) + +include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include) + +cuda_add_library(qsim_cuda MODULE pybind_main_cuda.cpp) +set_target_properties(qsim_cuda PROPERTIES + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" +) +set_source_files_properties(pybind_main_cuda.cpp PROPERTIES LANGUAGE CUDA) diff --git a/pybind_interface/cuda/pybind_main_cuda.cpp b/pybind_interface/cuda/pybind_main_cuda.cpp new file mode 100644 index 00000000..88fa3a61 --- /dev/null +++ b/pybind_interface/cuda/pybind_main_cuda.cpp @@ -0,0 +1,49 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "pybind_main_cuda.h" + +#include "../../lib/simulator_cuda.h" + +namespace qsim { + using Simulator = SimulatorCUDA; + + struct Factory { + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + Factory( + unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks + ) : ss_params{num_state_threads, num_dblocks}, + sim_params{num_sim_threads} {} + + StateSpace CreateStateSpace() const { + return StateSpace(ss_params); + } + + Simulator CreateSimulator() const { + return Simulator(sim_params); + } + + StateSpace::Parameter ss_params; + Simulator::Parameter sim_params; + }; + + inline void SetFlushToZeroAndDenormalsAreZeros() {} + inline void ClearFlushToZeroAndDenormalsAreZeros() {} +} + +#include "../pybind_main.cpp" diff --git a/pybind_interface/cuda/pybind_main_cuda.h b/pybind_interface/cuda/pybind_main_cuda.h new file mode 100644 index 00000000..b5b5979f --- /dev/null +++ b/pybind_interface/cuda/pybind_main_cuda.h @@ -0,0 +1,17 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "../pybind_main.h" + +PYBIND11_MODULE(qsim_cuda, m) { GPU_MODULE_BINDINGS } diff --git a/pybind_interface/custatevec/CMakeLists.txt b/pybind_interface/custatevec/CMakeLists.txt new file mode 100644 index 00000000..687ac2be --- /dev/null +++ b/pybind_interface/custatevec/CMakeLists.txt @@ -0,0 +1,46 @@ +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.11) +project(qsim LANGUAGES CXX CUDA) + +IF (WIN32) + set(CMAKE_CXX_FLAGS "/O2 /openmp") +ELSE() + set(CMAKE_CXX_FLAGS "-O3 -fopenmp") +ENDIF() + +if(APPLE) + set(CMAKE_CXX_STANDARD 14) + include_directories("/usr/local/include" "/usr/local/opt/llvm/include") + link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") +endif() + +INCLUDE(../GetPybind11.cmake) +find_package(PythonLibs 3.6 REQUIRED) +find_package(CUDA REQUIRED) + +include_directories(${pybind11_INCLUDE_DIRS}) + +include_directories($ENV{CUQUANTUM_DIR}/include) +link_directories($ENV{CUQUANTUM_DIR}/lib $ENV{CUQUANTUM_DIR}/lib64) + +cuda_add_library(qsim_custatevec MODULE pybind_main_custatevec.cpp) +target_link_libraries(qsim_custatevec -lcustatevec -lcublas) + +set_target_properties(qsim_custatevec PROPERTIES + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" +) +set_source_files_properties(pybind_main_custatevec.cpp PROPERTIES LANGUAGE CUDA) diff --git a/pybind_interface/custatevec/pybind_main_custatevec.cpp b/pybind_interface/custatevec/pybind_main_custatevec.cpp new file mode 100644 index 00000000..1e399633 --- /dev/null +++ b/pybind_interface/custatevec/pybind_main_custatevec.cpp @@ -0,0 +1,61 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "pybind_main_custatevec.h" + +#include "../../lib/simulator_custatevec.h" + +namespace qsim { + +using Simulator = SimulatorCuStateVec; + +struct Factory { + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + // num_sim_threads, num_state_threads and num_dblocks are unused, but kept + // for consistency with other factories. + Factory(unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; +}; + +inline void SetFlushToZeroAndDenormalsAreZeros() {} +inline void ClearFlushToZeroAndDenormalsAreZeros() {} + +} + +#include "../pybind_main.cpp" diff --git a/pybind_interface/custatevec/pybind_main_custatevec.h b/pybind_interface/custatevec/pybind_main_custatevec.h new file mode 100644 index 00000000..b6722bf4 --- /dev/null +++ b/pybind_interface/custatevec/pybind_main_custatevec.h @@ -0,0 +1,17 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "../pybind_main.h" + +PYBIND11_MODULE(qsim_custatevec, m) { GPU_MODULE_BINDINGS } diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt index b7d28714..0808ccf6 100644 --- a/pybind_interface/decide/CMakeLists.txt +++ b/pybind_interface/decide/CMakeLists.txt @@ -1,5 +1,11 @@ cmake_minimum_required(VERSION 3.11) -project(qsim) + +execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc) +if(has_nvcc STREQUAL "") + project(qsim) +else() + project(qsim LANGUAGES CXX CUDA) +endif() IF (WIN32) set(CMAKE_CXX_FLAGS "/O2 /openmp") @@ -13,16 +19,27 @@ if(APPLE) link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") endif() -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.2.4 -) -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +INCLUDE(../GetPybind11.cmake) + +if(has_nvcc STREQUAL "") + pybind11_add_module(qsim_decide decide.cpp) +else() + find_package(PythonLibs 3.6 REQUIRED) + find_package(CUDA REQUIRED) + + include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include) + + cuda_add_library(qsim_decide MODULE decide.cpp) + + if(DEFINED ENV{CUQUANTUM_DIR}) + target_compile_options(qsim_decide PRIVATE + $<$:-D__CUSTATEVEC__> + ) + endif() + + set_target_properties(qsim_decide PROPERTIES + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" + ) + set_source_files_properties(decide.cpp PROPERTIES LANGUAGE CUDA) endif() -pybind11_add_module(qsim_decide decide.cpp) diff --git a/pybind_interface/decide/decide.cpp b/pybind_interface/decide/decide.cpp index 2fb41848..6355ad3b 100644 --- a/pybind_interface/decide/decide.cpp +++ b/pybind_interface/decide/decide.cpp @@ -1,3 +1,17 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include namespace py = pybind11; @@ -33,7 +47,7 @@ int detect_instructions() { } if (nIds >= 7) { cpuid(info, 7); - if ((info[1] & (1 << 5) )!= 0) { + if ((info[1] & (1 << 5))!= 0) { instr = AVX2; } if ((info[1] & (1 << 16)) != 0) { @@ -45,9 +59,46 @@ int detect_instructions() { return static_cast(instr); } +enum GPUCapabilities { + CUDA = 0, CUSTATEVEC = 1, NO_GPU = 10, NO_CUSTATEVEC = 11 }; + +// For now, GPU detection is performed at compile time, as our wheels are +// generated on Github Actions runners which do not have GPU support. +// +// Users wishing to use qsim with GPU will need to compile locally on a device +// which has the necessary CUDA toolkit. +int detect_gpu() { + #ifdef __NVCC__ + GPUCapabilities gpu = CUDA; + #else + GPUCapabilities gpu = NO_GPU; + #endif + return gpu; +} + +// For now, cuStateVec detection is performed at compile time, as our wheels +// are generated on Github Actions runners which do not have GPU support. +// +// Users wishing to use qsim with cuStateVec will need to compile locally on +// a device which has the necessary CUDA toolkit and cuStateVec library. 
+int detect_custatevec() { + #if defined(__NVCC__) && defined(__CUSTATEVEC__) + GPUCapabilities gpu = CUSTATEVEC; + #else + GPUCapabilities gpu = NO_CUSTATEVEC; + #endif + return gpu; +} + PYBIND11_MODULE(qsim_decide, m) { m.doc() = "pybind11 plugin"; // optional module docstring // Methods for returning amplitudes m.def("detect_instructions", &detect_instructions, "Detect SIMD"); + + // Detect available GPUs. + m.def("detect_gpu", &detect_gpu, "Detect GPU"); + + // Detect cuStateVec. + m.def("detect_custatevec", &detect_custatevec, "Detect cuStateVec"); } diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp index 150dbc43..5ddee99f 100644 --- a/pybind_interface/pybind_main.cpp +++ b/pybind_interface/pybind_main.cpp @@ -22,6 +22,7 @@ #include #include "../lib/bitstring.h" +#include "../lib/channel.h" #include "../lib/expect.h" #include "../lib/formux.h" #include "../lib/fuser_mqubit.h" @@ -30,29 +31,11 @@ #include "../lib/qtrajectory.h" #include "../lib/run_qsim.h" #include "../lib/run_qsimh.h" -#include "../lib/util.h" using namespace qsim; namespace { -struct Factory { - Factory(unsigned num_threads) : num_threads(num_threads) {} - - using Simulator = qsim::Simulator; - using StateSpace = Simulator::StateSpace; - - StateSpace CreateStateSpace() const { - return StateSpace(num_threads); - } - - Simulator CreateSimulator() const { - return Simulator(num_threads); - } - - unsigned num_threads; -}; - template T parseOptions(const py::dict &options, const char *key) { if (!options.contains(key)) { @@ -356,6 +339,9 @@ void add_channel(const unsigned time, channel.emplace_back(KrausOperator{ KrausOperator::kNormal, is_unitary, prob, {gate} }); + if (!is_unitary) { + channel.back().CalculateKdKMatrix(); + } } ncircuit->channels.push_back(channel); } @@ -399,10 +385,24 @@ std::vector> qsim_simulate(const py::dict &options) { using Runner = QSimRunner>, Factory>; - unsigned num_threads; + bool use_gpu; + bool denormals_are_zeros; + unsigned 
gpu_mode; + unsigned num_sim_threads = 0; + unsigned num_state_threads = 0; + unsigned num_dblocks = 0; Runner::Parameter param; try { - num_threads = parseOptions(options, "t\0"); + use_gpu = parseOptions(options, "g\0"); + gpu_mode = parseOptions(options, "gmode\0"); + denormals_are_zeros = parseOptions(options, "z\0"); + if (use_gpu == 0) { + num_sim_threads = parseOptions(options, "t\0"); + } else if (gpu_mode == 0) { + num_sim_threads = parseOptions(options, "gsmt\0"); + num_state_threads = parseOptions(options, "gsst\0"); + num_dblocks = parseOptions(options, "gdb\0"); + } param.max_fused_size = parseOptions(options, "f\0"); param.verbosity = parseOptions(options, "v\0"); param.seed = parseOptions(options, "s\0"); @@ -410,7 +410,16 @@ std::vector> qsim_simulate(const py::dict &options) { IO::errorf(exp.what()); return {}; } - Runner::Run(param, Factory(num_threads), circuit, measure); + + if (denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } else { + ClearFlushToZeroAndDenormalsAreZeros(); + } + + Runner::Run( + param, Factory(num_sim_threads, num_state_threads, num_dblocks), circuit, + measure); return amplitudes; } @@ -427,7 +436,7 @@ std::vector> qtrajectory_simulate(const py::dict &options) { return {}; } - using Simulator = qsim::Simulator; + using Simulator = Factory::Simulator; using StateSpace = Simulator::StateSpace; using State = StateSpace::State; @@ -440,11 +449,25 @@ std::vector> qtrajectory_simulate(const py::dict &options) { Simulator>; Runner::Parameter param; - unsigned num_threads; + bool use_gpu; + bool denormals_are_zeros; + unsigned gpu_mode; + unsigned num_sim_threads = 0; + unsigned num_state_threads = 0; + unsigned num_dblocks = 0; uint64_t seed; try { - num_threads = parseOptions(options, "t\0"); + use_gpu = parseOptions(options, "g\0"); + gpu_mode = parseOptions(options, "gmode\0"); + denormals_are_zeros = parseOptions(options, "z\0"); + if (use_gpu == 0) { + num_sim_threads = parseOptions(options, "t\0"); + } else 
if (gpu_mode == 0) { + num_sim_threads = parseOptions(options, "gsmt\0"); + num_state_threads = parseOptions(options, "gsst\0"); + num_dblocks = parseOptions(options, "gdb\0"); + } param.max_fused_size = parseOptions(options, "f\0"); param.verbosity = parseOptions(options, "v\0"); seed = parseOptions(options, "s\0"); @@ -453,17 +476,23 @@ std::vector> qtrajectory_simulate(const py::dict &options) { return {}; } - Simulator simulator = Factory(num_threads).CreateSimulator(); - StateSpace state_space = Factory(num_threads).CreateStateSpace(); + Factory factory(num_sim_threads, num_state_threads, num_dblocks); + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); auto measure = [&bitstrings, &ncircuit, &litudes, &state_space]( - unsigned k, const State &state, - std::vector& stat) { + unsigned k, const State &state, Runner::Stat& stat) { for (const auto &b : bitstrings) { amplitudes.push_back(state_space.GetAmpl(state, b)); } }; + if (denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } else { + ClearFlushToZeroAndDenormalsAreZeros(); + } + if (!Runner::RunBatch(param, ncircuit, seed, seed + 1, state_space, simulator, measure)) { IO::errorf("qtrajectory simulation of the circuit errored out.\n"); @@ -476,7 +505,7 @@ std::vector> qtrajectory_simulate(const py::dict &options) { class SimulatorHelper { public: using Simulator = Factory::Simulator; - using StateSpace = Simulator::StateSpace; + using StateSpace = Factory::StateSpace; using State = StateSpace::State; using Gate = Cirq::GateCirq; @@ -496,6 +525,15 @@ class SimulatorHelper { return helper.release_state_to_python(); } + static std::vector sample_final_state( + const py::dict &options, bool is_noisy, uint64_t num_samples) { + auto helper = SimulatorHelper(options, is_noisy); + if (!helper.is_valid || !helper.simulate(0)) { + return {}; + } + return helper.sample(num_samples); + } + template static std::vector> simulate_expectation_values( const 
py::dict &options, @@ -533,10 +571,76 @@ class SimulatorHelper { return results; } + template + static std::vector>> + simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, unsigned> + >>>& opsums_and_qubit_counts, + bool is_noisy, const StateType& input_state) { + auto helper = SimulatorHelper(options, is_noisy); + if (!helper.is_valid) { + return {}; + } + std::vector>> results( + opsums_and_qubit_counts.size() + ); + if (!is_noisy) { + // Init outside of simulation to enable stepping. + helper.init_state(input_state); + uint64_t begin = 0; + for (unsigned i = 0; i < opsums_and_qubit_counts.size(); ++i) { + auto& pair = opsums_and_qubit_counts[i]; + uint64_t end = std::get<0>(pair); + auto& counts = std::get<1>(pair); + if (!helper.simulate_subcircuit(begin, end)) { + return {}; + } + results[i] = helper.get_expectation_value(counts); + begin = end; + } + return results; + } + + // Aggregate expectation values for noisy circuits. + for (unsigned i = 0; i < opsums_and_qubit_counts.size(); ++i) { + auto& counts = std::get<1>(opsums_and_qubit_counts[i]); + results[i].resize(counts.size(), 0); + } + for (unsigned rep = 0; rep < helper.noisy_reps; ++rep) { + // Init outside of simulation to enable stepping. 
+ helper.init_state(input_state); + uint64_t begin = 0; + for (unsigned i = 0; i < opsums_and_qubit_counts.size(); ++i) { + auto& pair = opsums_and_qubit_counts[i]; + uint64_t end = std::get<0>(pair); + auto& counts = std::get<1>(pair); + if (!helper.simulate_subcircuit(begin, end)) { + return {}; + } + auto evs = helper.get_expectation_value(counts); + for (unsigned j = 0; j < evs.size(); ++j) { + results[i][j] += evs[j]; + } + begin = end; + } + } + double inverse_num_reps = 1.0 / helper.noisy_reps; + for (unsigned i = 0; i < results.size(); ++i) { + for (unsigned j = 0; j < results[i].size(); ++j) { + results[i][j] *= inverse_num_reps; + } + } + return results; + } + private: SimulatorHelper(const py::dict &options, bool noisy) - : state_space(Factory(1).CreateStateSpace()), state(StateSpace::Null()), + : factory(Factory(1, 1, 1)), + state(StateSpace::Null()), scratch(StateSpace::Null()) { + bool denormals_are_zeros; is_valid = false; is_noisy = noisy; try { @@ -548,14 +652,34 @@ class SimulatorHelper { circuit = getCircuit(options); num_qubits = circuit.num_qubits; } - num_threads = parseOptions(options, "t\0"); + + use_gpu = parseOptions(options, "g\0"); + gpu_mode = parseOptions(options, "gmode\0"); + denormals_are_zeros = parseOptions(options, "z\0"); + if (use_gpu == 0) { + num_sim_threads = parseOptions(options, "t\0"); + } else if (gpu_mode == 0) { + num_sim_threads = parseOptions(options, "gsmt\0"); + num_state_threads = parseOptions(options, "gsst\0"); + num_dblocks = parseOptions(options, "gdb\0"); + } max_fused_size = parseOptions(options, "f\0"); verbosity = parseOptions(options, "v\0"); seed = parseOptions(options, "s\0"); - state_space = Factory(num_threads).CreateStateSpace(); + if (use_gpu == 0 || gpu_mode == 0) { + factory = Factory(num_sim_threads, num_state_threads, num_dblocks); + } + + StateSpace state_space = factory.CreateStateSpace(); state = state_space.Create(num_qubits); is_valid = true; + + if (denormals_are_zeros) { + 
SetFlushToZeroAndDenormalsAreZeros(); + } else { + ClearFlushToZeroAndDenormalsAreZeros(); + } } catch (const std::invalid_argument &exp) { // If this triggers, is_valid is false. IO::errorf(exp.what()); @@ -563,17 +687,14 @@ class SimulatorHelper { } void init_state(uint64_t input_state) { + StateSpace state_space = factory.CreateStateSpace(); state_space.SetAllZeros(state); state_space.SetAmpl(state, input_state, 1, 0); } void init_state(const py::array_t &input_vector) { - const float* ptr = input_vector.data(); - auto f = [](unsigned n, unsigned m, uint64_t i, const float* ptr, - float* fsv) { - fsv[i] = ptr[i]; - }; - For(num_threads).Run(input_vector.size(), f, ptr, state.get()); + StateSpace state_space = factory.CreateStateSpace(); + state_space.Copy(input_vector.data(), state); state_space.NormalToInternalOrder(state); } @@ -596,35 +717,80 @@ class SimulatorHelper { bool simulate(const StateType& input_state) { init_state(input_state); bool result = false; + if (is_noisy) { - std::vector stat; + NoisyRunner::Stat stat; auto params = get_noisy_params(); - Simulator simulator = Factory(num_threads).CreateSimulator(); - StateSpace state_space = Factory(num_threads).CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); result = NoisyRunner::RunOnce(params, ncircuit, seed, state_space, - simulator, scratch, state, stat); + simulator, state, stat); + } else { + result = Runner::Run(get_params(), factory, circuit, state); + } + seed += 1; + return result; + } + + bool simulate_subcircuit(uint64_t begin, uint64_t end) { + bool result = false; + + if (is_noisy) { + NoisyRunner::Stat stat; + auto params = get_noisy_params(); + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); + + result = NoisyRunner::RunOnce( + params, ncircuit.num_qubits, + ncircuit.channels.begin() + begin, + ncircuit.channels.begin() + end, + seed, state_space, simulator, 
state, stat + ); } else { - result = Runner::Run(get_params(), Factory(num_threads), circuit, state); + Circuit subcircuit; + subcircuit.num_qubits = circuit.num_qubits; + subcircuit.gates = std::vector( + circuit.gates.begin() + begin, + circuit.gates.begin() + end + ); + result = Runner::Run(get_params(), factory, subcircuit, state); } seed += 1; return result; } + std::vector sample(uint64_t num_samples) { + StateSpace state_space = factory.CreateStateSpace(); + return state_space.Sample(state, num_samples, seed); + } + py::array_t release_state_to_python() { + StateSpace state_space = factory.CreateStateSpace(); state_space.InternalToNormalOrder(state); uint64_t fsv_size = 2 * (uint64_t{1} << num_qubits); - float* fsv = state.release(); - auto capsule = py::capsule( - fsv, [](void *data) { detail::free(data); }); - return py::array_t(fsv_size, fsv, capsule); + if (state.requires_copy_to_host()) { + auto* fsv = new float[state_space.MinSize(state.num_qubits())]; + state_space.Copy(state, fsv); + // Cast on delete to silence warnings. 
+ auto capsule = py::capsule( + fsv, [](void *data) { delete [] (float*)data; }); + return py::array_t(fsv_size, fsv, capsule); + } else { + float* fsv = state.release(); + auto capsule = py::capsule( + fsv, [](void *data) { detail::free(data); }); + return py::array_t(fsv_size, fsv, capsule); + } } std::vector> get_expectation_value( const std::vector>, unsigned>>& opsums_and_qubit_counts) { - Simulator simulator = Factory(num_threads).CreateSimulator(); + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); using Fuser = MultiQubitGateFuser; std::vector> results; @@ -635,9 +801,11 @@ class SimulatorHelper { if (opsum_qubits <= 6) { results.push_back(ExpectationValue(opsum, simulator, state)); } else { - Fuser::Parameter param; - results.push_back(ExpectationValue( - param, opsum, state_space, simulator, state, scratch)); + Fuser::Parameter params; + params.max_fused_size = max_fused_size; + params.verbosity = verbosity; + results.push_back(ExpectationValue( + params, opsum, state_space, simulator, state, scratch)); } } return results; @@ -648,12 +816,16 @@ class SimulatorHelper { Circuit circuit; NoisyCircuit ncircuit; - StateSpace state_space; + Factory factory; State state; State scratch; + bool use_gpu; + unsigned gpu_mode; unsigned num_qubits; - unsigned num_threads; + unsigned num_sim_threads; + unsigned num_state_threads; + unsigned num_dblocks; unsigned noisy_reps; unsigned max_fused_size; unsigned verbosity; @@ -707,6 +879,28 @@ std::vector> qsim_simulate_expectation_values( options, opsums_and_qubit_counts, false, input_vector); } +std::vector>> +qsim_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, unsigned> + >>>& opsums_and_qubit_counts, + uint64_t input_state) { + return SimulatorHelper::simulate_moment_expectation_values( + options, opsums_and_qubit_counts, false, input_state); +} + +std::vector>> +qsim_simulate_moment_expectation_values( + const py::dict 
&options, + const std::vector>>, unsigned> + >>>& opsums_and_qubit_counts, + const py::array_t &input_vector) { + return SimulatorHelper::simulate_moment_expectation_values( + options, opsums_and_qubit_counts, false, input_vector); +} + std::vector> qtrajectory_simulate_expectation_values( const py::dict &options, const std::vector> qtrajectory_simulate_expectation_values( options, opsums_and_qubit_counts, true, input_vector); } +std::vector>> +qtrajectory_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, unsigned> + >>>& opsums_and_qubit_counts, + uint64_t input_state) { + return SimulatorHelper::simulate_moment_expectation_values( + options, opsums_and_qubit_counts, true, input_state); +} + +std::vector>> +qtrajectory_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, unsigned> + >>>& opsums_and_qubit_counts, + const py::array_t &input_vector) { + return SimulatorHelper::simulate_moment_expectation_values( + options, opsums_and_qubit_counts, true, input_vector); +} + // Methods for sampling. 
+std::vector qsim_sample_final( + const py::dict &options, uint64_t num_samples) { + return SimulatorHelper::sample_final_state(options, false, num_samples); +} + +std::vector qtrajectory_sample_final( + const py::dict &options, uint64_t num_samples) { + return SimulatorHelper::sample_final_state(options, true, num_samples); +} + std::vector qsim_sample(const py::dict &options) { Circuit> circuit; try { @@ -745,10 +971,24 @@ std::vector qsim_sample(const py::dict &options) { using Runner = QSimRunner>, Factory>; - unsigned num_threads; + bool use_gpu; + bool denormals_are_zeros; + unsigned gpu_mode; + unsigned num_sim_threads = 0; + unsigned num_state_threads = 0; + unsigned num_dblocks = 0; Runner::Parameter param; try { - num_threads = parseOptions(options, "t\0"); + use_gpu = parseOptions(options, "g\0"); + gpu_mode = parseOptions(options, "gmode\0"); + denormals_are_zeros = parseOptions(options, "z\0"); + if (use_gpu == 0) { + num_sim_threads = parseOptions(options, "t\0"); + } else if (gpu_mode == 0) { + num_sim_threads = parseOptions(options, "gsmt\0"); + num_state_threads = parseOptions(options, "gsst\0"); + num_dblocks = parseOptions(options, "gdb\0"); + } param.max_fused_size = parseOptions(options, "f\0"); param.verbosity = parseOptions(options, "v\0"); param.seed = parseOptions(options, "s\0"); @@ -758,11 +998,18 @@ std::vector qsim_sample(const py::dict &options) { } std::vector results; - StateSpace state_space = Factory(num_threads).CreateStateSpace(); + Factory factory(num_sim_threads, num_state_threads, num_dblocks); + StateSpace state_space = factory.CreateStateSpace(); State state = state_space.Create(circuit.num_qubits); state_space.SetStateZero(state); - if (!Runner::Run(param, Factory(num_threads), circuit, state, results)) { + if (denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } else { + ClearFlushToZeroAndDenormalsAreZeros(); + } + + if (!Runner::Run(param, factory, circuit, state, results)) { IO::errorf("qsim sampling of 
the circuit errored out.\n"); return {}; } @@ -792,11 +1039,25 @@ std::vector qtrajectory_sample(const py::dict &options) { Simulator>; Runner::Parameter param; - unsigned num_threads; + bool use_gpu; + bool denormals_are_zeros; + unsigned gpu_mode; + unsigned num_sim_threads = 0; + unsigned num_state_threads = 0; + unsigned num_dblocks = 0; uint64_t seed; try { - num_threads = parseOptions(options, "t\0"); + use_gpu = parseOptions(options, "g\0"); + gpu_mode = parseOptions(options, "gmode\0"); + denormals_are_zeros = parseOptions(options, "z\0"); + if (use_gpu == 0) { + num_sim_threads = parseOptions(options, "t\0"); + } else if (gpu_mode == 0) { + num_sim_threads = parseOptions(options, "gsmt\0"); + num_state_threads = parseOptions(options, "gsst\0"); + num_dblocks = parseOptions(options, "gdb\0"); + } param.max_fused_size = parseOptions(options, "f\0"); param.verbosity = parseOptions(options, "v\0"); seed = parseOptions(options, "s\0"); @@ -806,35 +1067,41 @@ std::vector qtrajectory_sample(const py::dict &options) { return {}; } - Simulator simulator = Factory(num_threads).CreateSimulator(); - StateSpace state_space = Factory(num_threads).CreateStateSpace(); + Factory factory(num_sim_threads, num_state_threads, num_dblocks); + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); std::vector> results; auto measure = [&results, &ncircuit, &state_space]( - unsigned k, const State &state, - std::vector& stat) { + unsigned k, const State& state, Runner::Stat& stat) { // Converts stat (which matches the MeasurementResult 'bits' field) into // bitstrings matching the MeasurementResult 'bitstring' field. 
unsigned idx = 0; for (const auto& channel : ncircuit.channels) { if (channel[0].kind != gate::kMeasurement) continue; - for (const auto &op : channel[0].ops) { + for (const auto& op : channel[0].ops) { std::vector bitstring; - uint64_t val = stat[idx]; - for (const auto &q : op.qubits) { + uint64_t val = stat.samples[idx]; + for (const auto& q : op.qubits) { bitstring.push_back((val >> q) & 1); } results.push_back(bitstring); idx += 1; - if (idx >= stat.size()) + if (idx >= stat.samples.size()) return; } } }; + if (denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } else { + ClearFlushToZeroAndDenormalsAreZeros(); + } + if (!Runner::RunBatch(param, ncircuit, seed, seed + 1, state_space, simulator, measure)) { IO::errorf("qtrajectory sampling of the circuit errored out.\n"); @@ -888,7 +1155,7 @@ std::vector> qsimh_simulate(const py::dict &options) { // Define container for amplitudes std::vector> amplitudes(bitstrings.size(), 0); - Factory factory(param.num_threads); + Factory factory(param.num_threads, 0, 0); if (Runner::Run(param, factory, circuit, parts, bitstrings, amplitudes)) { return amplitudes; diff --git a/pybind_interface/pybind_main.h b/pybind_interface/pybind_main.h index 8de3d455..3263fdd8 100644 --- a/pybind_interface/pybind_main.h +++ b/pybind_interface/pybind_main.h @@ -90,6 +90,8 @@ py::array_t qsim_simulate_fullstate( const py::dict &options, const py::array_t &input_vector); std::vector qsim_sample(const py::dict &options); +std::vector qsim_sample_final( + const py::dict &options, uint64_t num_samples); // Methods for simulating noisy circuits. std::vector> qtrajectory_simulate(const py::dict &options); @@ -100,6 +102,8 @@ py::array_t qtrajectory_simulate_fullstate( const py::dict &options, const py::array_t &input_vector); std::vector qtrajectory_sample(const py::dict &options); +std::vector qtrajectory_sample_final( + const py::dict &options, uint64_t num_samples); // As above, but returning expectation values instead. 
std::vector> qsim_simulate_expectation_values( @@ -116,6 +120,24 @@ std::vector> qsim_simulate_expectation_values( qsim::Cirq::GateCirq>>, unsigned>>& opsums_and_qubit_counts, const py::array_t &input_vector); +std::vector>> +qsim_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, + unsigned + >>>>& opsums_and_qubit_counts, + uint64_t input_state); +std::vector>> +qsim_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, + unsigned + >>>>& opsums_and_qubit_counts, + const py::array_t &input_vector); std::vector> qtrajectory_simulate_expectation_values( const py::dict &options, const std::vector> qtrajectory_simulate_expectation_values( qsim::Cirq::GateCirq>>, unsigned>>& opsums_and_qubit_counts, const py::array_t &input_vector); +std::vector>> +qtrajectory_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, + unsigned + >>>>& opsums_and_qubit_counts, + uint64_t input_state); +std::vector>> +qtrajectory_simulate_moment_expectation_values( + const py::dict &options, + const std::vector>>, + unsigned + >>>>& opsums_and_qubit_counts, + const py::array_t &input_vector); // Hybrid simulator. 
std::vector> qsimh_simulate(const py::dict &options); @@ -164,8 +204,12 @@ std::vector> qsimh_simulate(const py::dict &options); \ /* Methods for returning samples */ \ m.def("qsim_sample", &qsim_sample, "Call the qsim sampler"); \ + m.def("qsim_sample_final", &qsim_sample_final, \ + "Call the qsim final-state sampler"); \ m.def("qtrajectory_sample", &qtrajectory_sample, \ "Call the qtrajectory sampler"); \ + m.def("qtrajectory_sample_final", &qtrajectory_sample_final, \ + "Call the qtrajectory final-state sampler"); \ \ using GateCirq = qsim::Cirq::GateCirq; \ using OpString = qsim::OpString; \ @@ -186,6 +230,25 @@ std::vector> qsimh_simulate(const py::dict &options); &qsim_simulate_expectation_values), \ "Call the qsim simulator for expectation value simulation"); \ \ + m.def("qsim_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + uint64_t)>( \ + &qsim_simulate_moment_expectation_values), \ + "Call the qsim simulator for step-by-step expectation value simulation"); \ + m.def("qsim_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + const py::array_t&)>( \ + &qsim_simulate_moment_expectation_values), \ + "Call the qsim simulator for step-by-step expectation value simulation"); \ + \ m.def("qtrajectory_simulate_expectation_values", \ static_cast>(*)( \ const py::dict&, \ @@ -201,6 +264,27 @@ std::vector> qsimh_simulate(const py::dict &options); &qtrajectory_simulate_expectation_values), \ "Call the qtrajectory simulator for expectation value simulation"); \ \ + m.def("qtrajectory_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + uint64_t)>( \ + &qtrajectory_simulate_moment_expectation_values), \ + "Call the qtrajectory simulator for step-by-step " \ + "expectation value simulation"); \ + 
m.def("qtrajectory_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + const py::array_t&)>( \ + &qtrajectory_simulate_moment_expectation_values), \ + "Call the qtrajectory simulator for step-by-step " \ + "expectation value simulation"); \ + \ /* Method for hybrid simulation */ \ m.def("qsimh_simulate", &qsimh_simulate, "Call the qsimh simulator"); \ \ @@ -293,4 +377,119 @@ std::vector> qsimh_simulate(const py::dict &options); \ m.def("add_gate_to_opstring", &add_gate_to_opstring, \ "Adds a gate to the given opstring."); + +#define GPU_MODULE_BINDINGS \ + m.doc() = "pybind11 plugin"; /* optional module docstring */ \ + /* Methods for returning amplitudes */ \ + m.def("qsim_simulate", &qsim_simulate, "Call the qsim simulator"); \ + m.def("qtrajectory_simulate", &qtrajectory_simulate, \ + "Call the qtrajectory simulator"); \ + \ + /* Methods for returning full state */ \ + m.def("qsim_simulate_fullstate", \ + static_cast(*)(const py::dict&, uint64_t)>( \ + &qsim_simulate_fullstate), \ + "Call the qsim simulator for full state vector simulation"); \ + m.def("qsim_simulate_fullstate", \ + static_cast(*)(const py::dict&, \ + const py::array_t&)>( \ + &qsim_simulate_fullstate), \ + "Call the qsim simulator for full state vector simulation"); \ + \ + m.def("qtrajectory_simulate_fullstate", \ + static_cast(*)(const py::dict&, uint64_t)>( \ + &qtrajectory_simulate_fullstate), \ + "Call the qtrajectory simulator for full state vector simulation"); \ + m.def("qtrajectory_simulate_fullstate", \ + static_cast(*)(const py::dict&, \ + const py::array_t&)>( \ + &qtrajectory_simulate_fullstate), \ + "Call the qtrajectory simulator for full state vector simulation"); \ + \ + /* Methods for returning samples */ \ + m.def("qsim_sample", &qsim_sample, "Call the qsim sampler"); \ + m.def("qsim_sample_final", &qsim_sample_final, \ + "Call the qsim final-state sampler"); \ + m.def("qtrajectory_sample", 
&qtrajectory_sample, \ + "Call the qtrajectory sampler"); \ + m.def("qtrajectory_sample_final", &qtrajectory_sample_final, \ + "Call the qtrajectory final-state sampler"); \ + \ + using GateCirq = qsim::Cirq::GateCirq; \ + using OpString = qsim::OpString; \ + \ + /* Methods for returning expectation values */ \ + m.def("qsim_simulate_expectation_values", \ + static_cast>(*)( \ + const py::dict&, \ + const std::vector, unsigned>>&, \ + uint64_t)>( \ + &qsim_simulate_expectation_values), \ + "Call the qsim simulator for expectation value simulation"); \ + m.def("qsim_simulate_expectation_values", \ + static_cast>(*)( \ + const py::dict&, \ + const std::vector, unsigned>>&, \ + const py::array_t&)>( \ + &qsim_simulate_expectation_values), \ + "Call the qsim simulator for expectation value simulation"); \ + \ + m.def("qsim_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + uint64_t)>( \ + &qsim_simulate_moment_expectation_values), \ + "Call the qsim simulator for step-by-step expectation value simulation"); \ + m.def("qsim_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + const py::array_t&)>( \ + &qsim_simulate_moment_expectation_values), \ + "Call the qsim simulator for step-by-step expectation value simulation"); \ + \ + \ + m.def("qtrajectory_simulate_expectation_values", \ + static_cast>(*)( \ + const py::dict&, \ + const std::vector, unsigned>>&, \ + uint64_t)>( \ + &qtrajectory_simulate_expectation_values), \ + "Call the qtrajectory simulator for expectation value simulation"); \ + m.def("qtrajectory_simulate_expectation_values", \ + static_cast>(*)( \ + const py::dict&, \ + const std::vector, unsigned>>&, \ + const py::array_t&)>( \ + &qtrajectory_simulate_expectation_values), \ + "Call the qtrajectory simulator for expectation value simulation"); \ + \ + m.def("qtrajectory_simulate_moment_expectation_values", 
\ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + uint64_t)>( \ + &qtrajectory_simulate_moment_expectation_values), \ + "Call the qtrajectory simulator for step-by-step " \ + "expectation value simulation"); \ + m.def("qtrajectory_simulate_moment_expectation_values", \ + static_cast>>(*)( \ + const py::dict&, \ + const std::vector, unsigned> \ + >>>&, \ + const py::array_t&)>( \ + &qtrajectory_simulate_moment_expectation_values), \ + "Call the qtrajectory simulator for step-by-step " \ + "expectation value simulation"); \ + \ + /* Method for hybrid simulation */ \ + m.def("qsimh_simulate", &qsimh_simulate, "Call the qsimh simulator"); #endif diff --git a/pybind_interface/sse/CMakeLists.txt b/pybind_interface/sse/CMakeLists.txt index 6fa7201f..fe9b218e 100644 --- a/pybind_interface/sse/CMakeLists.txt +++ b/pybind_interface/sse/CMakeLists.txt @@ -14,16 +14,5 @@ if(APPLE) link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib") endif() -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.2.4 -) -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() +INCLUDE(../GetPybind11.cmake) pybind11_add_module(qsim_sse pybind_main_sse.cpp) diff --git a/pybind_interface/sse/pybind_main_sse.cpp b/pybind_interface/sse/pybind_main_sse.cpp index 0ff8c731..674aecad 100644 --- a/pybind_interface/sse/pybind_main_sse.cpp +++ b/pybind_interface/sse/pybind_main_sse.cpp @@ -14,11 +14,35 @@ #include "pybind_main_sse.h" +#include "../../lib/formux.h" #include "../../lib/simulator_sse.h" +#include "../../lib/util_cpu.h" namespace qsim { template using Simulator = SimulatorSSE; + + struct Factory { + // num_state_threads and num_dblocks are unused, but kept for consistency + // with the GPU Factory. 
+ Factory( + unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) : num_threads(num_sim_threads) {} + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace(num_threads); + } + + Simulator CreateSimulator() const { + return Simulator(num_threads); + } + + unsigned num_threads; + }; } #include "../pybind_main.cpp" diff --git a/qsimcirq/__init__.py b/qsimcirq/__init__.py index d0518841..89c6197e 100644 --- a/qsimcirq/__init__.py +++ b/qsimcirq/__init__.py @@ -1,3 +1,18 @@ +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import importlib from qsimcirq import qsim_decide @@ -15,10 +30,35 @@ def _load_simd_qsim(): return qsim +def _load_qsim_gpu(): + instr = qsim_decide.detect_gpu() + if instr == 0: + qsim_gpu = importlib.import_module("qsimcirq.qsim_cuda") + else: + qsim_gpu = None + return qsim_gpu + + +def _load_qsim_custatevec(): + instr = qsim_decide.detect_custatevec() + if instr == 1: + qsim_custatevec = importlib.import_module("qsimcirq.qsim_custatevec") + else: + qsim_custatevec = None + return qsim_custatevec + + qsim = _load_simd_qsim() +qsim_gpu = _load_qsim_gpu() +qsim_custatevec = _load_qsim_custatevec() from .qsim_circuit import add_op_to_opstring, add_op_to_circuit, QSimCircuit -from .qsim_simulator import QSimSimulatorState, QSimSimulatorTrialResult, QSimSimulator +from .qsim_simulator import ( + QSimOptions, + QSimSimulatorState, + QSimSimulatorTrialResult, + QSimSimulator, +) from .qsimh_simulator import QSimhSimulator from qsimcirq._version import ( diff --git a/qsimcirq/_version.py b/qsimcirq/_version.py index 30919655..837c5a31 100644 --- a/qsimcirq/_version.py +++ b/qsimcirq/_version.py @@ -1,3 +1,3 @@ """The version number defined here is read automatically in setup.py.""" -__version__ = "0.10.1" +__version__ = "0.13.0" diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py index 4d63fcd6..fd7d9450 100644 --- a/qsimcirq/qsim_circuit.py +++ b/qsimcirq/qsim_circuit.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import warnings +from typing import Dict, Union import cirq +import numpy as np + from . import qsim -from typing import Dict, Union # List of parameter names that appear in valid Cirq protos. @@ -33,103 +34,183 @@ ] -def _cirq_gate_kind(gate: cirq.ops.Gate): - if isinstance(gate, cirq.ops.ControlledGate): - return _cirq_gate_kind(gate.sub_gate) - if isinstance(gate, cirq.ops.identity.IdentityGate): - # Identity gates will decompose to no-ops. 
- pass - if isinstance(gate, cirq.ops.XPowGate): - # cirq.rx also uses this path. - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kX - return qsim.kXPowGate - if isinstance(gate, cirq.ops.YPowGate): - # cirq.ry also uses this path. - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kY - return qsim.kYPowGate - if isinstance(gate, cirq.ops.ZPowGate): - # cirq.rz also uses this path. - if gate.global_shift == 0: - if gate.exponent == 1: - return qsim.kZ - if gate.exponent == 0.5: - return qsim.kS - if gate.exponent == 0.25: - return qsim.kT - return qsim.kZPowGate - if isinstance(gate, cirq.ops.HPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kH - return qsim.kHPowGate - if isinstance(gate, cirq.ops.CZPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kCZ - return qsim.kCZPowGate - if isinstance(gate, cirq.ops.CXPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kCX - return qsim.kCXPowGate - if isinstance(gate, cirq.ops.PhasedXPowGate): - return qsim.kPhasedXPowGate - if isinstance(gate, cirq.ops.PhasedXZGate): - return qsim.kPhasedXZGate - if isinstance(gate, cirq.ops.XXPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kXX - return qsim.kXXPowGate - if isinstance(gate, cirq.ops.YYPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kYY - return qsim.kYYPowGate - if isinstance(gate, cirq.ops.ZZPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kZZ - return qsim.kZZPowGate - if isinstance(gate, cirq.ops.SwapPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kSWAP - return qsim.kSwapPowGate - if isinstance(gate, cirq.ops.ISwapPowGate): - # cirq.riswap also uses this path. - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kISWAP - return qsim.kISwapPowGate - if isinstance(gate, cirq.ops.PhasedISwapPowGate): - # cirq.givens also uses this path. 
- return qsim.kPhasedISwapPowGate - if isinstance(gate, cirq.ops.FSimGate): - return qsim.kFSimGate - if isinstance(gate, cirq.ops.TwoQubitDiagonalGate): - return qsim.kTwoQubitDiagonalGate - if isinstance(gate, cirq.ops.ThreeQubitDiagonalGate): - return qsim.kThreeQubitDiagonalGate - if isinstance(gate, cirq.ops.CCZPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kCCZ - return qsim.kCCZPowGate - if isinstance(gate, cirq.ops.CCXPowGate): - if gate.exponent == 1 and gate.global_shift == 0: - return qsim.kCCX - return qsim.kCCXPowGate - if isinstance(gate, cirq.ops.CSwapGate): - return qsim.kCSwapGate - if isinstance(gate, cirq.ops.MatrixGate): - if gate.num_qubits() <= 6: - return qsim.kMatrixGate - raise NotImplementedError( - f"Received matrix on {gate.num_qubits()} qubits; " - + "only up to 6-qubit gates are supported." - ) - if isinstance(gate, cirq.ops.MeasurementGate): - # needed to inherit SimulatesSamples in sims - return qsim.kMeasurement +def _translate_ControlledGate(gate: cirq.ControlledGate): + return _cirq_gate_kind(gate.sub_gate) + + +def _translate_XPowGate(gate: cirq.XPowGate): + # cirq.rx also uses this path. + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kX + return qsim.kXPowGate + + +def _translate_YPowGate(gate: cirq.YPowGate): + # cirq.ry also uses this path. + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kY + return qsim.kYPowGate + + +def _translate_ZPowGate(gate: cirq.ZPowGate): + # cirq.rz also uses this path. 
+ if gate.global_shift == 0: + if gate.exponent == 1: + return qsim.kZ + if gate.exponent == 0.5: + return qsim.kS + if gate.exponent == 0.25: + return qsim.kT + return qsim.kZPowGate + + +def _translate_HPowGate(gate: cirq.HPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kH + return qsim.kHPowGate + + +def _translate_CZPowGate(gate: cirq.CZPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kCZ + return qsim.kCZPowGate + + +def _translate_CXPowGate(gate: cirq.CXPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kCX + return qsim.kCXPowGate + + +def _translate_PhasedXPowGate(gate: cirq.PhasedXPowGate): + return qsim.kPhasedXPowGate + + +def _translate_PhasedXZGate(gate: cirq.PhasedXZGate): + return qsim.kPhasedXZGate + + +def _translate_XXPowGate(gate: cirq.XXPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kXX + return qsim.kXXPowGate + + +def _translate_YYPowGate(gate: cirq.YYPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kYY + return qsim.kYYPowGate + + +def _translate_ZZPowGate(gate: cirq.ZZPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kZZ + return qsim.kZZPowGate + + +def _translate_SwapPowGate(gate: cirq.SwapPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kSWAP + return qsim.kSwapPowGate + + +def _translate_ISwapPowGate(gate: cirq.ISwapPowGate): + # cirq.riswap also uses this path. + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kISWAP + return qsim.kISwapPowGate + + +def _translate_PhasedISwapPowGate(gate: cirq.PhasedISwapPowGate): + # cirq.givens also uses this path. 
+ return qsim.kPhasedISwapPowGate + + +def _translate_FSimGate(gate: cirq.FSimGate): + return qsim.kFSimGate + + +def _translate_TwoQubitDiagonalGate(gate: cirq.TwoQubitDiagonalGate): + return qsim.kTwoQubitDiagonalGate + + +def _translate_ThreeQubitDiagonalGate(gate: cirq.ThreeQubitDiagonalGate): + return qsim.kThreeQubitDiagonalGate + + +def _translate_CCZPowGate(gate: cirq.CCZPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kCCZ + return qsim.kCCZPowGate + + +def _translate_CCXPowGate(gate: cirq.CCXPowGate): + if gate.exponent == 1 and gate.global_shift == 0: + return qsim.kCCX + return qsim.kCCXPowGate + + +def _translate_CSwapGate(gate: cirq.CSwapGate): + return qsim.kCSwapGate + + +def _translate_MatrixGate(gate: cirq.MatrixGate): + if gate.num_qubits() <= 6: + return qsim.kMatrixGate + raise NotImplementedError( + f"Received matrix on {gate.num_qubits()} qubits; " + + "only up to 6-qubit gates are supported." + ) + + +def _translate_MeasurementGate(gate: cirq.MeasurementGate): + # needed to inherit SimulatesSamples in sims + return qsim.kMeasurement + + +TYPE_TRANSLATOR = { + cirq.ControlledGate: _translate_ControlledGate, + cirq.XPowGate: _translate_XPowGate, + cirq.YPowGate: _translate_YPowGate, + cirq.ZPowGate: _translate_ZPowGate, + cirq.HPowGate: _translate_HPowGate, + cirq.CZPowGate: _translate_CZPowGate, + cirq.CXPowGate: _translate_CXPowGate, + cirq.PhasedXPowGate: _translate_PhasedXPowGate, + cirq.PhasedXZGate: _translate_PhasedXZGate, + cirq.XXPowGate: _translate_XXPowGate, + cirq.YYPowGate: _translate_YYPowGate, + cirq.ZZPowGate: _translate_ZZPowGate, + cirq.SwapPowGate: _translate_SwapPowGate, + cirq.ISwapPowGate: _translate_ISwapPowGate, + cirq.PhasedISwapPowGate: _translate_PhasedISwapPowGate, + cirq.FSimGate: _translate_FSimGate, + cirq.TwoQubitDiagonalGate: _translate_TwoQubitDiagonalGate, + cirq.ThreeQubitDiagonalGate: _translate_ThreeQubitDiagonalGate, + cirq.CCZPowGate: _translate_CCZPowGate, + cirq.CCXPowGate: 
_translate_CCXPowGate, + cirq.CSwapGate: _translate_CSwapGate, + cirq.MatrixGate: _translate_MatrixGate, + cirq.MeasurementGate: _translate_MeasurementGate, +} + + +def _cirq_gate_kind(gate: cirq.Gate): + for gate_type in type(gate).mro(): + translator = TYPE_TRANSLATOR.get(gate_type, None) + if translator is not None: + return translator(gate) # Unrecognized gates will be decomposed. return None -def _control_details(gate: cirq.ops.ControlledGate, qubits): +def _has_cirq_gate_kind(op: cirq.Operation): + if isinstance(op, cirq.ControlledOperation): + return _has_cirq_gate_kind(op.sub_operation) + return any(t in TYPE_TRANSLATOR for t in type(op.gate).mro()) + + +def _control_details(gate: cirq.ControlledGate, qubits): control_qubits = [] control_values = [] # TODO: support qudit control @@ -168,7 +249,7 @@ def add_op_to_opstring( if len(qsim_op.qubits) != 1: raise ValueError(f"OpString ops should have 1 qubit; got {len(qsim_op.qubits)}") - is_controlled = isinstance(qsim_gate, cirq.ops.ControlledGate) + is_controlled = isinstance(qsim_gate, cirq.ControlledGate) if is_controlled: raise ValueError(f"OpString ops should not be controlled.") @@ -188,7 +269,7 @@ def add_op_to_circuit( qubits = [qubit_to_index_dict[q] for q in qsim_op.qubits] qsim_qubits = qubits - is_controlled = isinstance(qsim_gate, cirq.ops.ControlledGate) + is_controlled = isinstance(qsim_gate, cirq.ControlledGate) if is_controlled: control_qubits, control_values = _control_details(qsim_gate, qubits) if control_qubits is None: @@ -251,18 +332,17 @@ class QSimCircuit(cirq.Circuit): def __init__( self, cirq_circuit: cirq.Circuit, - device: cirq.devices = cirq.devices.UNCONSTRAINED_DEVICE, allow_decomposition: bool = False, ): if allow_decomposition: - super().__init__([], device=device) + super().__init__() for moment in cirq_circuit: for op in moment: # This should call decompose on the gates self.append(op) else: - super().__init__(cirq_circuit, device=device) + super().__init__(cirq_circuit) def 
__eq__(self, other): if not isinstance(other, QSimCircuit): @@ -273,22 +353,20 @@ def __eq__(self, other): def _resolve_parameters_( self, param_resolver: cirq.study.ParamResolver, recursive: bool = True ): - return QSimCircuit( - cirq.resolve_parameters(super(), param_resolver, recursive), - device=self.device, - ) + return QSimCircuit(cirq.resolve_parameters(super(), param_resolver, recursive)) def translate_cirq_to_qsim( - self, qubit_order: cirq.ops.QubitOrderOrList = cirq.ops.QubitOrder.DEFAULT + self, qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT ) -> qsim.Circuit: """ Translates this Cirq circuit to the qsim representation. :qubit_order: Ordering of qubits - :return: a C++ qsim Circuit object + :return: a tuple of (C++ qsim Circuit object, moment boundary + gate indices) """ qsim_circuit = qsim.Circuit() - ordered_qubits = cirq.ops.QubitOrder.as_qubit_order(qubit_order).order_for( + ordered_qubits = cirq.QubitOrder.as_qubit_order(qubit_order).order_for( self.all_qubits() ) qsim_circuit.num_qubits = len(ordered_qubits) @@ -296,21 +374,22 @@ def translate_cirq_to_qsim( # qsim numbers qubits in reverse order from cirq ordered_qubits = list(reversed(ordered_qubits)) - def has_qsim_kind(op: cirq.ops.GateOperation): - return _cirq_gate_kind(op.gate) != None - - def to_matrix(op: cirq.ops.GateOperation): - mat = cirq.protocols.unitary(op.gate, None) + def to_matrix(op: cirq.GateOperation): + mat = cirq.unitary(op.gate, None) if mat is None: return NotImplemented - return cirq.ops.MatrixGate(mat).on(*op.qubits) + return cirq.MatrixGate(mat).on(*op.qubits) qubit_to_index_dict = {q: i for i, q in enumerate(ordered_qubits)} time_offset = 0 + gate_count = 0 + moment_indices = [] for moment in self: ops_by_gate = [ - cirq.decompose(op, fallback_decomposer=to_matrix, keep=has_qsim_kind) + cirq.decompose( + op, fallback_decomposer=to_matrix, keep=_has_cirq_gate_kind + ) for op in moment ] moment_length = max((len(gate_ops) for gate_ops in ops_by_gate), 
default=0) @@ -322,22 +401,24 @@ def to_matrix(op: cirq.ops.GateOperation): continue qsim_op = gate_ops[gi] time = time_offset + gi - gate_kind = _cirq_gate_kind(qsim_op.gate) add_op_to_circuit(qsim_op, time, qubit_to_index_dict, qsim_circuit) + gate_count += 1 time_offset += moment_length + moment_indices.append(gate_count) - return qsim_circuit + return qsim_circuit, moment_indices def translate_cirq_to_qtrajectory( - self, qubit_order: cirq.ops.QubitOrderOrList = cirq.ops.QubitOrder.DEFAULT + self, qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT ) -> qsim.NoisyCircuit: """ Translates this noisy Cirq circuit to the qsim representation. :qubit_order: Ordering of qubits - :return: a C++ qsim NoisyCircuit object + :return: a tuple of (C++ qsim NoisyCircuit object, moment boundary + gate indices) """ qsim_ncircuit = qsim.NoisyCircuit() - ordered_qubits = cirq.ops.QubitOrder.as_qubit_order(qubit_order).order_for( + ordered_qubits = cirq.QubitOrder.as_qubit_order(qubit_order).order_for( self.all_qubits() ) @@ -346,18 +427,17 @@ def translate_cirq_to_qtrajectory( qsim_ncircuit.num_qubits = len(ordered_qubits) - def has_qsim_kind(op: cirq.ops.GateOperation): - return _cirq_gate_kind(op.gate) != None - - def to_matrix(op: cirq.ops.GateOperation): + def to_matrix(op: cirq.GateOperation): mat = cirq.unitary(op.gate, None) if mat is None: return NotImplemented - return cirq.ops.MatrixGate(mat).on(*op.qubits) + return cirq.MatrixGate(mat).on(*op.qubits) qubit_to_index_dict = {q: i for i, q in enumerate(ordered_qubits)} time_offset = 0 + gate_count = 0 + moment_indices = [] for moment in self: moment_length = 0 ops_by_gate = [] @@ -367,7 +447,7 @@ def to_matrix(op: cirq.ops.GateOperation): for qsim_op in moment: if cirq.has_unitary(qsim_op) or cirq.is_measurement(qsim_op): oplist = cirq.decompose( - qsim_op, fallback_decomposer=to_matrix, keep=has_qsim_kind + qsim_op, fallback_decomposer=to_matrix, keep=_has_cirq_gate_kind ) ops_by_gate.append(oplist) 
moment_length = max(moment_length, len(oplist)) @@ -376,7 +456,7 @@ def to_matrix(op: cirq.ops.GateOperation): ops_by_mix.append(qsim_op) moment_length = max(moment_length, 1) pass - elif cirq.has_channel(qsim_op): + elif cirq.has_kraus(qsim_op): ops_by_channel.append(qsim_op) moment_length = max(moment_length, 1) pass @@ -391,8 +471,8 @@ def to_matrix(op: cirq.ops.GateOperation): continue qsim_op = gate_ops[gi] time = time_offset + gi - gate_kind = _cirq_gate_kind(qsim_op.gate) add_op_to_circuit(qsim_op, time, qubit_to_index_dict, qsim_ncircuit) + gate_count += 1 # Handle mixture output. for mixture in ops_by_mix: mixdata = [] @@ -404,10 +484,11 @@ def to_matrix(op: cirq.ops.GateOperation): mixdata.append((prob, mat.view(np.float32), unitary)) qubits = [qubit_to_index_dict[q] for q in mixture.qubits] qsim.add_channel(time_offset, qubits, mixdata, qsim_ncircuit) + gate_count += 1 # Handle channel output. for channel in ops_by_channel: chdata = [] - for i, mat in enumerate(cirq.channel(channel)): + for i, mat in enumerate(cirq.kraus(channel)): square_mat = np.reshape(mat, (int(np.sqrt(mat.size)), -1)) unitary = cirq.is_unitary(square_mat) singular_vals = np.linalg.svd(square_mat)[1] @@ -417,6 +498,8 @@ def to_matrix(op: cirq.ops.GateOperation): chdata.append((lower_bound_prob, mat.view(np.float32), unitary)) qubits = [qubit_to_index_dict[q] for q in channel.qubits] qsim.add_channel(time_offset, qubits, chdata, qsim_ncircuit) + gate_count += 1 time_offset += moment_length + moment_indices.append(gate_count) - return qsim_ncircuit + return qsim_ncircuit, moment_indices diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py index 7d7fbe8f..5835c574 100644 --- a/qsimcirq/qsim_simulator.py +++ b/qsimcirq/qsim_simulator.py @@ -13,41 +13,28 @@ # limitations under the License. 
from collections import deque +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -from cirq import ( - circuits, - linalg, - ops, - protocols, - sim, - study, - value, - SimulatesAmplitudes, - SimulatesFinalState, - SimulatesSamples, -) - -# TODO: import from cirq directly when fix is released -from cirq.sim.simulator import SimulatesExpectationValues +import cirq import numpy as np -from . import qsim +from . import qsim, qsim_gpu, qsim_custatevec import qsimcirq.qsim_circuit as qsimc -class QSimSimulatorState(sim.StateVectorSimulatorState): - def __init__(self, qsim_data: np.ndarray, qubit_map: Dict[ops.Qid, int]): +class QSimSimulatorState(cirq.StateVectorSimulatorState): + def __init__(self, qsim_data: np.ndarray, qubit_map: Dict[cirq.Qid, int]): state_vector = qsim_data.view(np.complex64) super().__init__(state_vector=state_vector, qubit_map=qubit_map) -@value.value_equality(unhashable=True) -class QSimSimulatorTrialResult(sim.StateVectorMixin, sim.SimulationTrialResult): +@cirq.value_equality(unhashable=True) +class QSimSimulatorTrialResult(cirq.StateVectorMixin, cirq.SimulationTrialResult): def __init__( self, - params: study.ParamResolver, + params: cirq.ParamResolver, measurements: Dict[str, np.ndarray], final_simulator_state: QSimSimulatorState, ): @@ -101,48 +88,126 @@ def __repr__(self) -> str: # This should probably live in Cirq... # TODO: update to support CircuitOperations. 
-def _needs_trajectories(circuit: circuits.Circuit) -> bool: +def _needs_trajectories(circuit: cirq.Circuit) -> bool: """Checks if the circuit requires trajectory simulation.""" for op in circuit.all_operations(): test_op = ( op - if not protocols.is_parameterized(op) - else protocols.resolve_parameters( - op, {param: 1 for param in protocols.parameter_names(op)} + if not cirq.is_parameterized(op) + else cirq.resolve_parameters( + op, {param: 1 for param in cirq.parameter_names(op)} ) ) - if not (protocols.has_unitary(test_op) or protocols.is_measurement(test_op)): + if not (cirq.is_measurement(test_op) or cirq.has_unitary(test_op)): return True return False +@dataclass +class QSimOptions: + """Container for options to the QSimSimulator. + + Options for the simulator can also be provided as a {string: value} dict, + using the format shown in the 'as_dict' function for this class. + + Args: + max_fused_gate_size: maximum number of qubits allowed per fused gate. + Circuits of less than 22 qubits usually perform best with this set + to 2 or 3, while larger circuits (with >= 22 qubits) typically + perform better with it set to 3 or 4. + cpu_threads: number of threads to use when running on CPU. For best + performance, this should equal the number of cores on the device. + ev_noisy_repetitions: number of repetitions used for estimating + expectation values of a noisy circuit. Does not affect other + simulation modes. + use_gpu: whether to use GPU instead of CPU for simulation. The "gpu_*" + arguments below are only considered if this is set to True. + gpu_mode: use CUDA if set to 0 (default value) or use the NVIDIA + cuStateVec library if set to any other value. The "gpu_*" + arguments below are only considered if this is set to 0. + gpu_sim_threads: number of threads per CUDA block to use for the GPU + Simulator. This must be a power of 2 in the range [32, 256]. + gpu_state_threads: number of threads per CUDA block to use for the GPU + StateSpace. 
This must be a power of 2 in the range [32, 1024]. + gpu_data_blocks: number of data blocks to use on GPU. Below 16 data + blocks, performance is noticeably reduced. + verbosity: Logging verbosity. + denormals_are_zeros: if true, set flush-to-zero and denormals-are-zeros + MXCSR control flags. This prevents rare cases of performance + slowdown potentially at the cost of a tiny precision loss. + """ + + max_fused_gate_size: int = 2 + cpu_threads: int = 1 + ev_noisy_repetitions: int = 1 + use_gpu: bool = False + gpu_mode: int = 0 + gpu_sim_threads: int = 256 + gpu_state_threads: int = 512 + gpu_data_blocks: int = 16 + verbosity: int = 0 + denormals_are_zeros: bool = False + + def as_dict(self): + """Generates an options dict from this object. + + Options to QSimSimulator can also be provided in this format directly. + """ + return { + "f": self.max_fused_gate_size, + "t": self.cpu_threads, + "r": self.ev_noisy_repetitions, + "g": self.use_gpu, + "gmode": self.gpu_mode, + "gsmt": self.gpu_sim_threads, + "gsst": self.gpu_state_threads, + "gdb": self.gpu_data_blocks, + "v": self.verbosity, + "z": self.denormals_are_zeros, + } + + +@dataclass +class MeasInfo: + """Info about each measure operation in the circuit being simulated. + + Attributes: + key: The measurement key. + idx: The "instance" of a possibly-repeated measurement key. + invert_mask: True for any measurement bits that should be inverted. + start: Start index in qsim's output array for this measurement. + end: End index (non-inclusive) in qsim's output array. + """ + + key: str + idx: int + invert_mask: Tuple[bool, ...] 
+ start: int + end: int + + class QSimSimulator( - SimulatesSamples, - SimulatesAmplitudes, - SimulatesFinalState, - SimulatesExpectationValues, + cirq.SimulatesSamples, + cirq.SimulatesAmplitudes, + cirq.SimulatesFinalState, + cirq.SimulatesExpectationValues, ): def __init__( self, - qsim_options: dict = {}, - seed: value.RANDOM_STATE_OR_SEED_LIKE = None, + qsim_options: Union[None, Dict, QSimOptions] = None, + seed: cirq.RANDOM_STATE_OR_SEED_LIKE = None, + noise: cirq.NOISE_MODEL_LIKE = None, circuit_memoization_size: int = 0, ): """Creates a new QSimSimulator using the given options and seed. Args: - qsim_options: A map of circuit options for the simulator. These will be - applied to all circuits run using this simulator. Accepted keys and - their behavior are as follows: - - 'f': int (> 0). Maximum size of fused gates. Default: 2. - - 'r': int (> 0). Noisy repetitions (see below). Default: 1. - - 't': int (> 0). Number of threads to run on. Default: 1. - - 'v': int (>= 0). Log verbosity. Default: 0. - See qsim/docs/usage.md for more details on these options. - "Noisy repetitions" specifies how many repetitions to aggregate - over when calculating expectation values for a noisy circuit. - Note that this does not apply to other simulation types. + qsim_options: An options dict or QSimOptions object with options + to use for all circuits run using this simulator. See the + QSimOptions class for details. seed: A random state or seed object, as defined in cirq.value. + noise: A cirq.NoiseModel to apply to all circuits simulated with + this simulator. circuit_memoization_size: The number of last translated circuits to be memoized from simulation executions, to eliminate translation overhead. Every simulation will perform a linear @@ -155,25 +220,60 @@ def __init__( Raises: ValueError if internal keys 'c', 'i' or 's' are included in 'qsim_options'. 
""" + if isinstance(qsim_options, QSimOptions): + qsim_options = qsim_options.as_dict() + else: + qsim_options = qsim_options or {} + if any(k in qsim_options for k in ("c", "i", "s")): raise ValueError( 'Keys {"c", "i", "s"} are reserved for internal use and cannot be ' "used in QSimCircuit instantiation." ) - self._prng = value.parse_random_state(seed) - self.qsim_options = {"t": 1, "f": 2, "v": 0, "r": 1} + self._prng = cirq.value.parse_random_state(seed) + self.qsim_options = QSimOptions().as_dict() self.qsim_options.update(qsim_options) - # Deque of (, ) tuples. + self.noise = cirq.NoiseModel.from_noise_model_like(noise) + + # module to use for simulation + if self.qsim_options["g"]: + if self.qsim_options["gmode"] == 0: + if qsim_gpu is None: + raise ValueError( + "GPU execution requested, but not supported. If your " + "device has GPU support, you may need to compile qsim " + "locally." + ) + else: + self._sim_module = qsim_gpu + else: + if qsim_custatevec is None: + raise ValueError( + "cuStateVec GPU execution requested, but not " + "supported. If your device has GPU support and the " + "NVIDIA cuStateVec library is installed, you may need " + "to compile qsim locally." + ) + else: + self._sim_module = qsim_custatevec + else: + self._sim_module = qsim + + # Deque of ( + # , + # , + # + # ) tuples. self._translated_circuits = deque(maxlen=circuit_memoization_size) def get_seed(self): # Limit seed size to 32-bit integer for C++ conversion. - return self._prng.randint(2 ** 31 - 1) + return self._prng.randint(2**31 - 1) def _run( self, - circuit: circuits.Circuit, - param_resolver: study.ParamResolver, + circuit: cirq.Circuit, + param_resolver: cirq.ParamResolver, repetitions: int, ) -> Dict[str, np.ndarray]: """Run a simulation, mimicking quantum hardware. @@ -187,14 +287,14 @@ def _run( A dictionary from measurement gate key to measurement results. 
""" - param_resolver = param_resolver or study.ParamResolver({}) - solved_circuit = protocols.resolve_parameters(circuit, param_resolver) + param_resolver = param_resolver or cirq.ParamResolver({}) + solved_circuit = cirq.resolve_parameters(circuit, param_resolver) return self._sample_measure_results(solved_circuit, repetitions) def _sample_measure_results( self, - program: circuits.Circuit, + program: cirq.Circuit, repetitions: int = 1, ) -> Dict[str, np.ndarray]: """Samples from measurement gates in the circuit. @@ -216,112 +316,134 @@ def _sample_measure_results( ValueError: If there are multiple MeasurementGates with the same key, or if repetitions is negative. """ - if not isinstance(program, qsimc.QSimCircuit): - program = qsimc.QSimCircuit(program, device=program.device) + + # Add noise to the circuit if a noise model was provided. + all_qubits = program.all_qubits() + program = qsimc.QSimCircuit( + self.noise.noisy_moments(program, sorted(all_qubits)) + if self.noise is not cirq.NO_NOISE + else program, + ) # Compute indices of measured qubits - ordered_qubits = ops.QubitOrder.DEFAULT.order_for(program.all_qubits()) + ordered_qubits = cirq.QubitOrder.DEFAULT.order_for(all_qubits) num_qubits = len(ordered_qubits) qubit_map = {qubit: index for index, qubit in enumerate(ordered_qubits)} - # Computes - # - the list of qubits to be measured - # - the start (inclusive) and end (exclusive) indices of each measurement - # - a mapping from measurement key to measurement gate + # Compute: + # - number of qubits for each measurement key. + # - measurement ops for each measurement key. + # - measurement info for each measurement. + # - total number of measured bits. 
measurement_ops = [ op for _, op, _ in program.findall_operations_with_gate_type( - ops.MeasurementGate + cirq.MeasurementGate ) ] - measured_qubits = [] # type: List[ops.Qid] - bounds = {} # type: Dict[str, Tuple] - meas_ops = {} # type: Dict[str, cirq.GateOperation] - current_index = 0 + num_qubits_by_key: Dict[str, int] = {} + meas_ops: Dict[str, List[cirq.GateOperation]] = {} + meas_infos: List[MeasInfo] = [] + num_bits = 0 for op in measurement_ops: gate = op.gate - key = protocols.measurement_key(gate) - meas_ops[key] = op - if key in bounds: - raise ValueError(f"Duplicate MeasurementGate with key {key}") - bounds[key] = (current_index, current_index + len(op.qubits)) - measured_qubits.extend(op.qubits) - current_index += len(op.qubits) + key = cirq.measurement_key_name(gate) + meas_ops.setdefault(key, []) + i = len(meas_ops[key]) + meas_ops[key].append(op) + n = len(op.qubits) + if key in num_qubits_by_key: + if n != num_qubits_by_key[key]: + raise ValueError( + f"repeated key {key!r} with different numbers of qubits: " + f"{num_qubits_by_key[key]} != {n}" + ) + else: + num_qubits_by_key[key] = n + meas_infos.append( + MeasInfo( + key=key, + idx=i, + invert_mask=gate.full_invert_mask(), + start=num_bits, + end=num_bits + n, + ) + ) + num_bits += n # Set qsim options - options = {} - options.update(self.qsim_options) + options = {**self.qsim_options} - results = {} - for key, bound in bounds.items(): - results[key] = np.ndarray( - shape=(repetitions, bound[1] - bound[0]), dtype=int - ) + results = { + key: np.ndarray(shape=(repetitions, len(meas_ops[key]), n), dtype=int) + for key, n in num_qubits_by_key.items() + } noisy = _needs_trajectories(program) - if noisy: - translator_fn_name = "translate_cirq_to_qtrajectory" - sampler_fn = qsim.qtrajectory_sample - else: - translator_fn_name = "translate_cirq_to_qsim" - sampler_fn = qsim.qsim_sample - if not noisy and program.are_all_measurements_terminal() and repetitions > 1: - print( - "Provided circuit has no 
intermediate measurements. " - + "Sampling repeatedly from final state vector." - ) # Measurements must be replaced with identity gates to sample properly. # Simply removing them may omit qubits from the circuit. for i in range(len(program.moments)): - program.moments[i] = ops.Moment( + program.moments[i] = cirq.Moment( op - if not isinstance(op.gate, ops.MeasurementGate) - else [ops.IdentityGate(1).on(q) for q in op.qubits] + if not isinstance(op.gate, cirq.MeasurementGate) + else [cirq.IdentityGate(1).on(q) for q in op.qubits] for op in program.moments[i] ) translator_fn_name = "translate_cirq_to_qsim" - options["c"] = self._translate_circuit( + options["c"], _ = self._translate_circuit( program, translator_fn_name, - ops.QubitOrder.DEFAULT, + cirq.QubitOrder.DEFAULT, ) options["s"] = self.get_seed() - final_state = qsim.qsim_simulate_fullstate(options, 0) - full_results = sim.sample_state_vector( - final_state.view(np.complex64), - range(num_qubits), - repetitions=repetitions, - seed=self._prng, + raw_results = self._sim_module.qsim_sample_final(options, repetitions) + full_results = np.array( + [ + [bool(result & (1 << q)) for q in reversed(range(num_qubits))] + for result in raw_results + ] ) - for i in range(repetitions): - for key, op in meas_ops.items(): + for key, oplist in meas_ops.items(): + for i, op in enumerate(oplist): meas_indices = [qubit_map[qubit] for qubit in op.qubits] - for j, q in enumerate(meas_indices): - results[key][i][j] = full_results[i][q] + invert_mask = op.gate.full_invert_mask() + # Apply invert mask to re-ordered results + results[key][:, i, :] = full_results[:, meas_indices] ^ invert_mask + else: - options["c"] = self._translate_circuit( + if noisy: + translator_fn_name = "translate_cirq_to_qtrajectory" + sampler_fn = self._sim_module.qtrajectory_sample + else: + translator_fn_name = "translate_cirq_to_qsim" + sampler_fn = self._sim_module.qsim_sample + + options["c"], _ = self._translate_circuit( program, translator_fn_name, - 
ops.QubitOrder.DEFAULT, + cirq.QubitOrder.DEFAULT, ) + measurements = np.empty(shape=(repetitions, num_bits), dtype=int) for i in range(repetitions): options["s"] = self.get_seed() - measurements = sampler_fn(options) - for key, bound in bounds.items(): - for j in range(bound[1] - bound[0]): - results[key][i][j] = int(measurements[bound[0] + j]) + measurements[i] = sampler_fn(options) + + for m in meas_infos: + results[m.key][:, m.idx, :] = ( + measurements[:, m.start : m.end] ^ m.invert_mask + ) return results def compute_amplitudes_sweep( self, - program: circuits.Circuit, + program: cirq.Circuit, bitstrings: Sequence[int], - params: study.Sweepable, - qubit_order: ops.QubitOrderOrList = ops.QubitOrder.DEFAULT, + params: cirq.Sweepable, + qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT, ) -> Sequence[Sequence[complex]]: """Computes the desired amplitudes using qsim. @@ -341,13 +463,17 @@ def compute_amplitudes_sweep( Returns: List of amplitudes. """ - if not isinstance(program, qsimc.QSimCircuit): - program = qsimc.QSimCircuit(program, device=program.device) - # qsim numbers qubits in reverse order from cirq - cirq_order = ops.QubitOrder.as_qubit_order(qubit_order).order_for( - program.all_qubits() + # Add noise to the circuit if a noise model was provided. 
+ all_qubits = program.all_qubits() + program = qsimc.QSimCircuit( + self.noise.noisy_moments(program, sorted(all_qubits)) + if self.noise is not cirq.NO_NOISE + else program, ) + + # qsim numbers qubits in reverse order from cirq + cirq_order = cirq.QubitOrder.as_qubit_order(qubit_order).order_for(all_qubits) num_qubits = len(cirq_order) bitstrings = [ format(bitstring, "b").zfill(num_qubits)[::-1] for bitstring in bitstrings @@ -356,19 +482,19 @@ def compute_amplitudes_sweep( options = {"i": "\n".join(bitstrings)} options.update(self.qsim_options) - param_resolvers = study.to_resolvers(params) + param_resolvers = cirq.to_resolvers(params) trials_results = [] if _needs_trajectories(program): translator_fn_name = "translate_cirq_to_qtrajectory" - simulator_fn = qsim.qtrajectory_simulate + simulator_fn = self._sim_module.qtrajectory_simulate else: translator_fn_name = "translate_cirq_to_qsim" - simulator_fn = qsim.qsim_simulate + simulator_fn = self._sim_module.qsim_simulate for prs in param_resolvers: - solved_circuit = protocols.resolve_parameters(program, prs) - options["c"] = self._translate_circuit( + solved_circuit = cirq.resolve_parameters(program, prs) + options["c"], _ = self._translate_circuit( solved_circuit, translator_fn_name, cirq_order, @@ -381,9 +507,9 @@ def compute_amplitudes_sweep( def simulate_sweep( self, - program: circuits.Circuit, - params: study.Sweepable, - qubit_order: ops.QubitOrderOrList = ops.QubitOrder.DEFAULT, + program: cirq.Circuit, + params: cirq.Sweepable, + qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT, initial_state: Optional[Union[int, np.ndarray]] = None, ) -> List["SimulationTrialResult"]: """Simulates the supplied Circuit. @@ -392,6 +518,11 @@ def simulate_sweep( wave function. In contrast to simulate, this allows for sweeping over different parameter values. 
+ Avoid using this method with `use_gpu=True` in the simulator options; + when used with GPU this method must copy state from device to host memory + multiple times, which can be very slow. This issue is not present in + `simulate_expectation_values_sweep`. + Args: program: The circuit to simulate. params: Parameters to run with the program. @@ -414,24 +545,28 @@ def simulate_sweep( initial_state = 0 if not isinstance(initial_state, (int, np.ndarray)): raise TypeError("initial_state must be an int or state vector.") - if not isinstance(program, qsimc.QSimCircuit): - program = qsimc.QSimCircuit(program, device=program.device) + + # Add noise to the circuit if a noise model was provided. + all_qubits = program.all_qubits() + program = qsimc.QSimCircuit( + self.noise.noisy_moments(program, sorted(all_qubits)) + if self.noise is not cirq.NO_NOISE + else program, + ) options = {} options.update(self.qsim_options) - param_resolvers = study.to_resolvers(params) + param_resolvers = cirq.to_resolvers(params) # qsim numbers qubits in reverse order from cirq - cirq_order = ops.QubitOrder.as_qubit_order(qubit_order).order_for( - program.all_qubits() - ) + cirq_order = cirq.QubitOrder.as_qubit_order(qubit_order).order_for(all_qubits) qsim_order = list(reversed(cirq_order)) num_qubits = len(qsim_order) if isinstance(initial_state, np.ndarray): if initial_state.dtype != np.complex64: raise TypeError(f"initial_state vector must have dtype np.complex64.") input_vector = initial_state.view(np.float32) - if len(input_vector) != 2 ** num_qubits * 2: + if len(input_vector) != 2**num_qubits * 2: raise ValueError( f"initial_state vector size must match number of qubits." 
f"Expected: {2**num_qubits * 2} Received: {len(input_vector)}" @@ -440,15 +575,15 @@ def simulate_sweep( trials_results = [] if _needs_trajectories(program): translator_fn_name = "translate_cirq_to_qtrajectory" - fullstate_simulator_fn = qsim.qtrajectory_simulate_fullstate + fullstate_simulator_fn = self._sim_module.qtrajectory_simulate_fullstate else: translator_fn_name = "translate_cirq_to_qsim" - fullstate_simulator_fn = qsim.qsim_simulate_fullstate + fullstate_simulator_fn = self._sim_module.qsim_simulate_fullstate for prs in param_resolvers: - solved_circuit = protocols.resolve_parameters(program, prs) + solved_circuit = cirq.resolve_parameters(program, prs) - options["c"] = self._translate_circuit( + options["c"], _ = self._translate_circuit( solved_circuit, translator_fn_name, cirq_order, @@ -474,10 +609,10 @@ def simulate_sweep( def simulate_expectation_values_sweep( self, - program: "cirq.Circuit", - observables: Union["cirq.PauliSumLike", List["cirq.PauliSumLike"]], - params: "study.Sweepable", - qubit_order: ops.QubitOrderOrList = ops.QubitOrder.DEFAULT, + program: cirq.Circuit, + observables: Union[cirq.PauliSumLike, List[cirq.PauliSumLike]], + params: cirq.Sweepable, + qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT, initial_state: Any = None, permit_terminal_measurements: bool = False, ) -> List[List[float]]: @@ -491,7 +626,7 @@ def simulate_expectation_values_sweep( Args: program: The circuit to simulate. observables: An observable or list of observables. - param_resolver: Parameters to run with the program. + params: Parameters to run with the program. qubit_order: Determines the canonical ordering of the qubits. This is often used in specifying the initial state, i.e. the ordering of the computational basis states. 
@@ -520,11 +655,10 @@ def simulate_expectation_values_sweep( ) if not isinstance(observables, List): observables = [observables] - psumlist = [ops.PauliSum.wrap(pslike) for pslike in observables] + psumlist = [cirq.PauliSum.wrap(pslike) for pslike in observables] - cirq_order = ops.QubitOrder.as_qubit_order(qubit_order).order_for( - program.all_qubits() - ) + all_qubits = program.all_qubits() + cirq_order = cirq.QubitOrder.as_qubit_order(qubit_order).order_for(all_qubits) qsim_order = list(reversed(cirq_order)) num_qubits = len(qsim_order) qubit_map = {qubit: index for index, qubit in enumerate(qsim_order)} @@ -547,18 +681,23 @@ def simulate_expectation_values_sweep( initial_state = 0 if not isinstance(initial_state, (int, np.ndarray)): raise TypeError("initial_state must be an int or state vector.") - if not isinstance(program, qsimc.QSimCircuit): - program = qsimc.QSimCircuit(program, device=program.device) + + # Add noise to the circuit if a noise model was provided. + program = qsimc.QSimCircuit( + self.noise.noisy_moments(program, sorted(all_qubits)) + if self.noise is not cirq.NO_NOISE + else program, + ) options = {} options.update(self.qsim_options) - param_resolvers = study.to_resolvers(params) + param_resolvers = cirq.to_resolvers(params) if isinstance(initial_state, np.ndarray): if initial_state.dtype != np.complex64: raise TypeError(f"initial_state vector must have dtype np.complex64.") input_vector = initial_state.view(np.float32) - if len(input_vector) != 2 ** num_qubits * 2: + if len(input_vector) != 2**num_qubits * 2: raise ValueError( f"initial_state vector size must match number of qubits." 
f"Expected: {2**num_qubits * 2} Received: {len(input_vector)}" @@ -567,14 +706,14 @@ def simulate_expectation_values_sweep( results = [] if _needs_trajectories(program): translator_fn_name = "translate_cirq_to_qtrajectory" - ev_simulator_fn = qsim.qtrajectory_simulate_expectation_values + ev_simulator_fn = self._sim_module.qtrajectory_simulate_expectation_values else: translator_fn_name = "translate_cirq_to_qsim" - ev_simulator_fn = qsim.qsim_simulate_expectation_values + ev_simulator_fn = self._sim_module.qsim_simulate_expectation_values for prs in param_resolvers: - solved_circuit = protocols.resolve_parameters(program, prs) - options["c"] = self._translate_circuit( + solved_circuit = cirq.resolve_parameters(program, prs) + options["c"], _ = self._translate_circuit( solved_circuit, translator_fn_name, cirq_order, @@ -589,23 +728,162 @@ def simulate_expectation_values_sweep( return results + def simulate_moment_expectation_values( + self, + program: cirq.Circuit, + indexed_observables: Union[ + Dict[int, Union[cirq.PauliSumLike, List[cirq.PauliSumLike]]], + cirq.PauliSumLike, + List[cirq.PauliSumLike], + ], + param_resolver: cirq.ParamResolver, + qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT, + initial_state: Any = None, + ) -> List[List[float]]: + """Calculates expectation values at each moment of a circuit. + + Args: + program: The circuit to simulate. + indexed_observables: A map of moment indices to an observable + or list of observables to calculate after that moment. As a + convenience, users can instead pass in a single observable + or observable list to calculate after ALL moments. + param_resolver: Parameters to run with the program. + qubit_order: Determines the canonical ordering of the qubits. This + is often used in specifying the initial state, i.e. the + ordering of the computational basis states. + initial_state: The initial state for the simulation. The form of + this state depends on the simulation implementation. 
See + documentation of the implementing class for details. + permit_terminal_measurements: If the provided circuit ends with + measurement(s), this method will generate an error unless this + is set to True. This is meant to prevent measurements from + ruining expectation value calculations. + + Returns: + A list of expectation values for each moment m in the circuit, + where value `n` corresponds to `indexed_observables[m][n]`. + + Raises: + ValueError if 'program' has terminal measurement(s) and + 'permit_terminal_measurements' is False. (Note: We cannot test this + until Cirq's `are_any_measurements_terminal` is released.) + """ + if not isinstance(indexed_observables, Dict): + if not isinstance(indexed_observables, List): + indexed_observables = [ + (i, [indexed_observables]) for i, _ in enumerate(program) + ] + else: + indexed_observables = [ + (i, indexed_observables) for i, _ in enumerate(program) + ] + else: + indexed_observables = [ + (i, obs) if isinstance(obs, List) else (i, [obs]) + for i, obs in indexed_observables.items() + ] + indexed_observables.sort(key=lambda x: x[0]) + psum_pairs = [ + (i, [cirq.PauliSum.wrap(pslike) for pslike in obs_list]) + for i, obs_list in indexed_observables + ] + + all_qubits = program.all_qubits() + cirq_order = cirq.QubitOrder.as_qubit_order(qubit_order).order_for(all_qubits) + qsim_order = list(reversed(cirq_order)) + num_qubits = len(qsim_order) + qubit_map = {qubit: index for index, qubit in enumerate(qsim_order)} + + opsums_and_qcount_map = {} + for i, psumlist in psum_pairs: + opsums_and_qcount_map[i] = [] + for psum in psumlist: + opsum = [] + opsum_qubits = set() + for pstr in psum: + opstring = qsim.OpString() + opstring.weight = pstr.coefficient + for q, pauli in pstr.items(): + op = pauli.on(q) + opsum_qubits.add(q) + qsimc.add_op_to_opstring(op, qubit_map, opstring) + opsum.append(opstring) + opsums_and_qcount_map[i].append((opsum, len(opsum_qubits))) + + if initial_state is None: + initial_state = 0 + if not 
isinstance(initial_state, (int, np.ndarray)): + raise TypeError("initial_state must be an int or state vector.") + + # Add noise to the circuit if a noise model was provided. + program = qsimc.QSimCircuit( + self.noise.noisy_moments(program, sorted(all_qubits)) + if self.noise is not cirq.NO_NOISE + else program, + ) + + options = {} + options.update(self.qsim_options) + + param_resolver = cirq.to_resolvers(param_resolver) + if isinstance(initial_state, np.ndarray): + if initial_state.dtype != np.complex64: + raise TypeError(f"initial_state vector must have dtype np.complex64.") + input_vector = initial_state.view(np.float32) + if len(input_vector) != 2**num_qubits * 2: + raise ValueError( + f"initial_state vector size must match number of qubits." + f"Expected: {2**num_qubits * 2} Received: {len(input_vector)}" + ) + + is_noisy = _needs_trajectories(program) + if is_noisy: + translator_fn_name = "translate_cirq_to_qtrajectory" + ev_simulator_fn = ( + self._sim_module.qtrajectory_simulate_moment_expectation_values + ) + else: + translator_fn_name = "translate_cirq_to_qsim" + ev_simulator_fn = self._sim_module.qsim_simulate_moment_expectation_values + + solved_circuit = cirq.resolve_parameters(program, param_resolver) + options["c"], opsum_reindex = self._translate_circuit( + solved_circuit, + translator_fn_name, + cirq_order, + ) + opsums_and_qubit_counts = [] + for m, opsum_qc in opsums_and_qcount_map.items(): + pair = (opsum_reindex[m], opsum_qc) + opsums_and_qubit_counts.append(pair) + options["s"] = self.get_seed() + + if isinstance(initial_state, int): + return ev_simulator_fn(options, opsums_and_qubit_counts, initial_state) + elif isinstance(initial_state, np.ndarray): + return ev_simulator_fn(options, opsums_and_qubit_counts, input_vector) + def _translate_circuit( self, circuit: Any, translator_fn_name: str, - qubit_order: ops.QubitOrderOrList, + qubit_order: cirq.QubitOrderOrList, ): # If the circuit is memoized, reuse the corresponding translated # 
circuit. translated_circuit = None - for original, translated in self._translated_circuits: + for original, translated, m_indices in self._translated_circuits: if original == circuit: translated_circuit = translated + moment_indices = m_indices break if translated_circuit is None: translator_fn = getattr(circuit, translator_fn_name) - translated_circuit = translator_fn(qubit_order) - self._translated_circuits.append((circuit, translated_circuit)) + translated_circuit, moment_indices = translator_fn(qubit_order) + self._translated_circuits.append( + (circuit, translated_circuit, moment_indices) + ) - return translated_circuit + return translated_circuit, moment_indices diff --git a/qsimcirq/qsimh_simulator.py b/qsimcirq/qsimh_simulator.py index 3d2ec049..3077a935 100644 --- a/qsimcirq/qsimh_simulator.py +++ b/qsimcirq/qsimh_simulator.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union, Sequence +from typing import Sequence -from cirq import study, ops, protocols, circuits, value, SimulatesAmplitudes +import cirq from . import qsim import qsimcirq.qsim_circuit as qsimc -class QSimhSimulator(SimulatesAmplitudes): +class QSimhSimulator(cirq.SimulatesAmplitudes): def __init__(self, qsimh_options: dict = {}): """Creates a new QSimhSimulator using the given options. 
@@ -41,14 +41,14 @@ def __init__(self, qsimh_options: dict = {}): def compute_amplitudes_sweep( self, - program: circuits.Circuit, + program: cirq.Circuit, bitstrings: Sequence[int], - params: study.Sweepable, - qubit_order: ops.QubitOrderOrList = ops.QubitOrder.DEFAULT, + params: cirq.Sweepable, + qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT, ) -> Sequence[Sequence[complex]]: if not isinstance(program, qsimc.QSimCircuit): - program = qsimc.QSimCircuit(program, device=program.device) + program = qsimc.QSimCircuit(program) n_qubits = len(program.all_qubits()) # qsim numbers qubits in reverse order from cirq @@ -58,14 +58,14 @@ def compute_amplitudes_sweep( options = {"i": "\n".join(bitstrings)} options.update(self.qsimh_options) - param_resolvers = study.to_resolvers(params) + param_resolvers = cirq.to_resolvers(params) trials_results = [] for prs in param_resolvers: - solved_circuit = protocols.resolve_parameters(program, prs) + solved_circuit = cirq.resolve_parameters(program, prs) - options["c"] = solved_circuit.translate_cirq_to_qsim(qubit_order) + options["c"], _ = solved_circuit.translate_cirq_to_qsim(qubit_order) options.update(self.qsimh_options) amplitudes = qsim.qsimh_simulate(options) diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py index 144fce5f..ce439f29 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -19,7 +19,7 @@ import qsimcirq -class NoiseTrigger(cirq.SingleQubitGate): +class NoiseTrigger(cirq.Gate): """A no-op gate with no _unitary_ method defined. 
Appending this gate to a circuit will force it to use qtrajectory, but the @@ -29,7 +29,10 @@ class NoiseTrigger(cirq.SingleQubitGate): # def _mixture_(self): # return ((1.0, np.asarray([1, 0, 0, 1])),) - def _channel_(self): + def _num_qubits_(self) -> int: + return 1 + + def _kraus_(self): return (np.asarray([1, 0, 0, 1]),) @@ -54,6 +57,49 @@ def test_empty_moment(mode: str): assert result.final_state_vector.shape == (4,) +def test_repeated_keys(): + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.Moment(cirq.measure(q0, key="m")), + cirq.Moment(cirq.X(q1)), + cirq.Moment(cirq.measure(q1, key="m")), + cirq.Moment(cirq.X(q0)), + cirq.Moment(cirq.measure(q0, key="m")), + cirq.Moment(cirq.X(q1)), + cirq.Moment(cirq.measure(q1, key="m")), + ) + result = qsimcirq.QSimSimulator().run(circuit, repetitions=10) + assert result.records["m"].shape == (10, 4, 1) + assert np.all(result.records["m"][:, 0, :] == 0) + assert np.all(result.records["m"][:, 1, :] == 1) + assert np.all(result.records["m"][:, 2, :] == 1) + assert np.all(result.records["m"][:, 3, :] == 0) + + +def test_repeated_keys_same_moment(): + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.Moment(cirq.X(q1)), + cirq.Moment(cirq.measure(q0, key="m"), cirq.measure(q1, key="m")), + ) + result = qsimcirq.QSimSimulator().run(circuit, repetitions=10) + assert result.records["m"].shape == (10, 2, 1) + assert np.all(result.records["m"][:, 0, :] == 0) + assert np.all(result.records["m"][:, 1, :] == 1) + + +def test_repeated_keys_different_numbers_of_qubits(): + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.measure(q0, key="m"), + cirq.measure(q0, q1, key="m"), + ) + with pytest.raises( + ValueError, match="repeated key 'm' with different numbers of qubits" + ): + _ = qsimcirq.QSimSimulator().run(circuit, repetitions=10) + + def test_cirq_too_big_gate(): # Pick qubits. 
a, b, c, d, e, f, g = [ @@ -271,10 +317,11 @@ def test_invalid_params(): x, y = sympy.Symbol("x"), sympy.Symbol("y") circuit = cirq.Circuit(cirq.X(q0) ** x, cirq.H(q0) ** y) prs = [{x: np.int64(0), y: np.int64(1)}, {x: np.int64(1), y: "z"}] + sweep = cirq.ListSweep(prs) qsim_simulator = qsimcirq.QSimSimulator() with pytest.raises(ValueError, match="Parameters must be numeric"): - _ = qsim_simulator.simulate_sweep(circuit, params=prs) + _ = qsim_simulator.simulate_sweep(circuit, params=sweep) def test_iterable_qubit_order(): @@ -286,14 +333,11 @@ def test_iterable_qubit_order(): ) qsim_simulator = qsimcirq.QSimSimulator() - assert ( - qsim_simulator.compute_amplitudes( - circuit, - bitstrings=[0b00, 0b01], - qubit_order=reversed([q1, q0]), - ) - == qsim_simulator.compute_amplitudes(circuit, bitstrings=[0b00, 0b01]) - ) + assert qsim_simulator.compute_amplitudes( + circuit, + bitstrings=[0b00, 0b01], + qubit_order=reversed([q1, q0]), + ) == qsim_simulator.compute_amplitudes(circuit, bitstrings=[0b00, 0b01]) assert qsim_simulator.simulate( circuit, qubit_order=reversed([q1, q0]) @@ -361,6 +405,46 @@ def test_cirq_qsim_run(mode: str): assert value.shape == (5, 1) +def test_qsim_invert_mask(): + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.measure(q0, q1, key="d", invert_mask=[False, True]), + ) + cirq_sample = cirq.Simulator().sample(circuit, repetitions=5) + qsim_sample = qsimcirq.QSimSimulator().sample(circuit, repetitions=5) + assert qsim_sample.equals(cirq_sample) + + +def test_qsim_invert_mask_different_qubits(): + q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.measure(q1, key="a", invert_mask=[True]), + cirq.measure(q0, key="b", invert_mask=[True]), + cirq.measure(q0, q1, key="c", invert_mask=[False, True]), + cirq.measure(q1, q0, key="d", invert_mask=[False, True]), + ) + cirq_sample = cirq.Simulator().sample(circuit, repetitions=5) + qsim_sample = qsimcirq.QSimSimulator().sample(circuit, repetitions=5) + assert 
qsim_sample.equals(cirq_sample) + + +def test_qsim_invert_mask_intermediate_measure(): + q0, q1 = cirq.LineQubit.range(2) + # The dataframe generated by this should be all zeroes. + circuit = cirq.Circuit( + cirq.measure(q0, q1, key="a", invert_mask=[False, False]), + cirq.X(q0), + cirq.measure(q0, q1, key="b", invert_mask=[True, False]), + cirq.X(q1), + cirq.measure(q0, q1, key="c", invert_mask=[True, True]), + cirq.X(q0), + cirq.measure(q0, q1, key="d", invert_mask=[False, True]), + ) + cirq_sample = cirq.Simulator().sample(circuit, repetitions=5) + qsim_sample = qsimcirq.QSimSimulator().sample(circuit, repetitions=5) + assert qsim_sample.equals(cirq_sample) + + @pytest.mark.parametrize("mode", ["noiseless", "noisy"]) def test_qsim_run_vs_cirq_run(mode: str): # Simple circuit, want to check mapping of qubit(s) to their measurements @@ -431,6 +515,60 @@ def test_expectation_values(mode: str): assert cirq.approx_eq(qsim_result, cirq_result, atol=1e-6) +@pytest.mark.parametrize("mode", ["noiseless", "noisy"]) +def test_moment_expectation_values(mode: str): + # Perform a single-pass Rabi oscillation, measuring Z at each step. + q0 = cirq.LineQubit(0) + steps = 20 + circuit = cirq.Circuit(*[cirq.X(q0) ** 0.05 for _ in range(steps)]) + psum = cirq.Z(q0) + params = {} + + if mode == "noisy": + circuit.append(NoiseTrigger().on(q0)) + + qsim_simulator = qsimcirq.QSimSimulator() + qsim_result = qsim_simulator.simulate_moment_expectation_values( + circuit, psum, params + ) + # Omit noise trigger element + results = [r[0] for r in qsim_result][:steps] + assert np.allclose( + [result.real for result in results], + [np.cos(np.pi * (i + 1) / 20) for i in range(steps)], + atol=1e-6, + ) + + +@pytest.mark.parametrize("mode", ["noiseless", "noisy"]) +def test_select_moment_expectation_values(mode: str): + # Measure different observables after specified steps. 
+ q0, q1 = cirq.LineQubit.range(2) + circuit = cirq.Circuit( + cirq.Moment(cirq.X(q0), cirq.H(q1)), + cirq.Moment(cirq.H(q0), cirq.Z(q1)), + cirq.Moment(cirq.Z(q0), cirq.H(q1)), + cirq.Moment(cirq.H(q0), cirq.X(q1)), + ) + psum_map = { + 0: cirq.Z(q0), + 1: [cirq.X(q0), cirq.Z(q1)], + 3: [cirq.Z(q0), cirq.Z(q1)], + } + params = {} + + if mode == "noisy": + circuit.append(NoiseTrigger().on(q0)) + + qsim_simulator = qsimcirq.QSimSimulator() + qsim_result = qsim_simulator.simulate_moment_expectation_values( + circuit, psum_map, params + ) + expected_results = [[-1], [-1, 0], [1, 1]] + for i, result in enumerate(qsim_result): + assert np.allclose(result, expected_results[i]) + + def test_expectation_values_terminal_measurement_check(): a, b = [ cirq.GridQubit(0, 0), @@ -781,8 +919,8 @@ def test_mixture_simulation(): possible_circuits = [ cirq.Circuit(cirq.X(q0) ** 0.5, cirq.X(q1) ** 0.5, pf, bf) # Extract the operators from the mixtures to construct trajectories. - for pf in [NoiseStep(m).on(q0) for m in cirq.channel(pflip)] - for bf in [NoiseStep(m).on(q1) for m in cirq.channel(bflip)] + for pf in [NoiseStep(m).on(q0) for m in cirq.kraus(pflip)] + for bf in [NoiseStep(m).on(q1) for m in cirq.kraus(bflip)] ] possible_states = [ cirq.Simulator().simulate(pc).state_vector() for pc in possible_circuits @@ -823,8 +961,8 @@ def test_channel_simulation(): possible_circuits = [ cirq.Circuit(cirq.X(q0) ** 0.5, cirq.X(q1) ** 0.5, ad, gad) # Extract the operators from the channels to construct trajectories. 
- for ad in [NoiseStep(m).on(q0) for m in cirq.channel(amp_damp)] - for gad in [NoiseStep(m).on(q1) for m in cirq.channel(gen_amp_damp)] + for ad in [NoiseStep(m).on(q0) for m in cirq.kraus(amp_damp)] + for gad in [NoiseStep(m).on(q1) for m in cirq.kraus(gen_amp_damp)] ] possible_states = [ cirq.Simulator().simulate(pc).state_vector() for pc in possible_circuits @@ -860,7 +998,7 @@ def __init__(self, *prob_mat_pairs, num_qubits=1): def _num_qubits_(self): return self._num_qubits - def _channel_(self): + def _kraus_(self): return [cirq.unitary(op) for _, op, in self._prob_op_pairs] def steps(self): @@ -976,7 +1114,8 @@ def test_noise_aggregation(): # Test expectation value aggregation over repetitions of a noisy circuit. # Repetitions are handled in C++, so overhead costs are minimal. - qsim_simulator = qsimcirq.QSimSimulator(qsim_options={"r": 10000}, seed=1) + qsim_options = qsimcirq.QSimOptions(ev_noisy_repetitions=10000) + qsim_simulator = qsimcirq.QSimSimulator(qsim_options=qsim_options, seed=1) qsim_evs = qsim_simulator.simulate_expectation_values(circuit, [psum1, psum2]) assert len(qsim_evs) == 2 @@ -987,6 +1126,37 @@ def test_noise_aggregation(): assert cirq.approx_eq(qsim_evs, expected_evs, atol=0.05) +def test_noise_model(): + q0, q1 = cirq.LineQubit.range(2) + + circuit = cirq.Circuit(cirq.X(q0), cirq.CNOT(q0, q1), cirq.measure(q0, q1, key="m")) + quiet_sim = qsimcirq.QSimSimulator() + quiet_results = quiet_sim.run(circuit, repetitions=100) + assert quiet_results.histogram(key="m")[0b11] == 100 + + class ReadoutError(cirq.NoiseModel): + def noisy_operation(self, operation: "cirq.Operation") -> "cirq.OP_TREE": + if isinstance(operation.gate, cirq.MeasurementGate): + return [cirq.X.on_each(*operation.qubits), operation] + return [operation] + + noisy_sim = qsimcirq.QSimSimulator(noise=ReadoutError()) + noisy_results = noisy_sim.run(circuit, repetitions=100) + # ReadoutError will flip both qubits. 
+ assert noisy_results.histogram(key="m")[0b00] == 100 + + noisy_state = noisy_sim.simulate(circuit) + assert cirq.approx_eq(noisy_state.state_vector(), [1, 0, 0, 0]) + + obs = cirq.Z(q0) + cirq.Z(q1) + noisy_evs = noisy_sim.simulate_expectation_values( + circuit, + observables=obs, + permit_terminal_measurements=True, + ) + assert noisy_evs == [2] + + def test_multi_qubit_fusion(): q0, q1, q2, q3 = cirq.LineQubit.range(4) qubits = [q0, q1, q2, q3] @@ -1008,10 +1178,12 @@ def test_multi_qubit_fusion(): cirq.Y(q1) ** 0.5, ) - qsimSim = qsimcirq.QSimSimulator(qsim_options={"f": 2}) + options = qsimcirq.QSimOptions(max_fused_gate_size=2) + qsimSim = qsimcirq.QSimSimulator(qsim_options=options) result_2q_fusion = qsimSim.simulate(cirq_circuit, qubit_order=qubits) - qsimSim = qsimcirq.QSimSimulator(qsim_options={"f": 4}) + options.max_fused_gate_size = 4 + qsimSim = qsimcirq.QSimSimulator(qsim_options=options) result_4q_fusion = qsimSim.simulate(cirq_circuit, qubit_order=qubits) assert cirq.linalg.allclose_up_to_global_phase( result_2q_fusion.state_vector(), result_4q_fusion.state_vector() @@ -1022,16 +1194,17 @@ def test_multi_qubit_fusion(): def test_cirq_qsim_simulate_random_unitary(mode: str): q0, q1 = cirq.LineQubit.range(2) - qsimSim = qsimcirq.QSimSimulator(qsim_options={"t": 16, "v": 0}) + options = qsimcirq.QSimOptions(cpu_threads=16, verbosity=0) + qsimSim = qsimcirq.QSimSimulator(qsim_options=options) for iter in range(10): random_circuit = cirq.testing.random_circuit( qubits=[q0, q1], n_moments=8, op_density=0.99, random_state=iter ) - cirq.ConvertToCzAndSingleGates().optimize_circuit( - random_circuit - ) # cannot work with params - cirq.ExpandComposite().optimize_circuit(random_circuit) + random_circuit = cirq.optimize_for_target_gateset( + random_circuit, gateset=cirq.CZTargetGateset() + ) + random_circuit = cirq.expand_composite(random_circuit) if mode == "noisy": random_circuit.append(NoiseTrigger().on(q0)) @@ -1062,6 +1235,199 @@ def 
test_cirq_qsimh_simulate(): assert np.allclose(result, [0j, 0j, (1 + 0j), 0j]) +def test_qsim_gpu_unavailable(): + if qsimcirq.qsim_gpu is not None: + pytest.skip("GPU is available; skipping test.") + + # Attempt to create a simulator with GPU support. + gpu_options = qsimcirq.QSimOptions(use_gpu=True) + with pytest.raises( + ValueError, + match="GPU execution requested, but not supported", + ): + _ = qsimcirq.QSimSimulator(qsim_options=gpu_options) + + +def test_cirq_qsim_gpu_amplitudes(): + if qsimcirq.qsim_gpu is None: + pytest.skip("GPU is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.CNOT(a, b), cirq.CNOT(b, a), cirq.X(a)) + + # Enable GPU acceleration. + gpu_options = qsimcirq.QSimOptions(use_gpu=True) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=gpu_options) + result = qsimGpuSim.compute_amplitudes( + cirq_circuit, bitstrings=[0b00, 0b01, 0b10, 0b11] + ) + assert np.allclose(result, [0j, 0j, (1 + 0j), 0j]) + + +def test_cirq_qsim_gpu_simulate(): + if qsimcirq.qsim_gpu is None: + pytest.skip("GPU is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + gpu_options = qsimcirq.QSimOptions(use_gpu=True) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=gpu_options) + result = qsimGpuSim.simulate(cirq_circuit) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_cirq_qsim_gpu_expectation_values(): + if qsimcirq.qsim_gpu is None: + pytest.skip("GPU is not available for testing.") + # Pick qubits. 
+ a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + obs = [cirq.Z(a) * cirq.Z(b)] + + # Enable GPU acceleration. + gpu_options = qsimcirq.QSimOptions(use_gpu=True) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=gpu_options) + result = qsimGpuSim.simulate_expectation_values(cirq_circuit, obs) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate_expectation_values(cirq_circuit, obs) + assert np.allclose(result, cirq_result) + + +def test_cirq_qsim_gpu_input_state(): + if qsimcirq.qsim_gpu is None: + pytest.skip("GPU is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + gpu_options = qsimcirq.QSimOptions(use_gpu=True) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=gpu_options) + initial_state = np.asarray([0.5] * 4, dtype=np.complex64) + result = qsimGpuSim.simulate(cirq_circuit, initial_state=initial_state) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit, initial_state=initial_state) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_cirq_qsim_custatevec_amplitudes(): + if qsimcirq.qsim_custatevec is None: + pytest.skip("cuStateVec library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.CNOT(a, b), cirq.CNOT(b, a), cirq.X(a)) + + # Enable GPU acceleration. 
+ custatevec_options = qsimcirq.QSimOptions(gpu_mode=1) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevec_options) + result = qsimGpuSim.compute_amplitudes( + cirq_circuit, bitstrings=[0b00, 0b01, 0b10, 0b11] + ) + assert np.allclose(result, [0j, 0j, (1 + 0j), 0j]) + + +def test_cirq_qsim_custatevec_simulate(): + if qsimcirq.qsim_custatevec is None: + pytest.skip("cuStateVec library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + custatevec_options = qsimcirq.QSimOptions(gpu_mode=1) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevec_options) + result = qsimGpuSim.simulate(cirq_circuit) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_cirq_qsim_custatevec_expectation_values(): + if qsimcirq.qsim_custatevec is None: + pytest.skip("cuStateVec library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + obs = [cirq.Z(a) * cirq.Z(b)] + + # Enable GPU acceleration. + custatevec_options = qsimcirq.QSimOptions(gpu_mode=1) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevec_options) + result = qsimGpuSim.simulate_expectation_values(cirq_circuit, obs) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate_expectation_values(cirq_circuit, obs) + assert np.allclose(result, cirq_result) + + +def test_cirq_qsim_custatevec_input_state(): + if qsimcirq.qsim_custatevec is None: + pytest.skip("cuStateVec library is not available for testing.") + # Pick qubits. 
+ a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + custatevec_options = qsimcirq.QSimOptions(gpu_mode=1) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevec_options) + initial_state = np.asarray([0.5] * 4, dtype=np.complex64) + result = qsimGpuSim.simulate(cirq_circuit, initial_state=initial_state) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit, initial_state=initial_state) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_cirq_qsim_old_options(): + old_options = {"f": 3, "t": 4, "r": 100, "v": 1} + old_sim = qsimcirq.QSimSimulator(qsim_options=old_options) + + new_options = qsimcirq.QSimOptions( + max_fused_gate_size=3, + cpu_threads=4, + ev_noisy_repetitions=100, + verbosity=1, + ) + new_sim = qsimcirq.QSimSimulator(qsim_options=new_options) + assert new_sim.qsim_options == old_sim.qsim_options + + def test_cirq_qsim_params(): qubit = cirq.GridQubit(0, 0) @@ -1371,12 +1737,22 @@ def test_cirq_qsim_global_shift(): cirq_result = simulator.simulate(circuit) qsim_simulator = qsimcirq.QSimSimulator() - qsim_result = qsim_simulator.simulate(circuit) + qsim_result1 = qsim_simulator.simulate(circuit) assert cirq.linalg.allclose_up_to_global_phase( - qsim_result.state_vector(), cirq_result.state_vector() + qsim_result1.state_vector(), cirq_result.state_vector() ) + qsim_simulator.qsim_options["z"] = True + qsim_result2 = qsim_simulator.simulate(circuit) + + assert (qsim_result1.state_vector() == qsim_result2.state_vector()).all() + + qsim_simulator.qsim_options["z"] = False + qsim_result3 = qsim_simulator.simulate(circuit) + + assert (qsim_result1.state_vector() == qsim_result3.state_vector()).all() + @pytest.mark.parametrize("mode", ["noiseless", "noisy"]) def 
test_cirq_qsim_circuit_memoization_compute_amplitudes(mode: str): diff --git a/requirements.txt b/requirements.txt index 01d20cd5..0a95d64c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,5 @@ -# Runtime requirements for the python 3 version of cirq. - +absl-py cirq-core numpy~=1.16 -typing_extensions -absl-py - -# Build and test requirements - -black==20.8b1 -flynt~=0.60 pybind11 -pytest +typing_extensions diff --git a/setup.py b/setup.py index 9fea30da..10c7aa15 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ def run(self): def build_extension(self, ext): extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) cmake_args = [ + "-DCMAKE_CUDA_COMPILER=nvcc", "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, "-DPYTHON_EXECUTABLE=" + sys.executable, ] @@ -49,7 +50,7 @@ def build_extension(self, ext): cmake_args += [ "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir) ] - if sys.maxsize > 2 ** 32: + if sys.maxsize > 2**32: cmake_args += ["-A", "x64"] build_args += ["--", "/m"] else: @@ -77,6 +78,7 @@ def build_extension(self, ext): requirements = open("requirements.txt").readlines() +dev_requirements = open("dev-requirements.txt").readlines() description = "Schrödinger and Schrödinger-Feynman simulators for quantum circuits." 
@@ -89,10 +91,14 @@ def build_extension(self, ext): setup( name="qsimcirq", version=__version__, + url="https://github.com/quantumlib/qsim", author="Vamsi Krishna Devabathini", author_email="devabathini92@gmail.com", python_requires=">=3.3.0", install_requires=requirements, + extras_require={ + "dev": dev_requirements, + }, license="Apache 2", description=description, long_description=long_description, @@ -102,6 +108,8 @@ def build_extension(self, ext): CMakeExtension("qsimcirq/qsim_avx2"), CMakeExtension("qsimcirq/qsim_sse"), CMakeExtension("qsimcirq/qsim_basic"), + CMakeExtension("qsimcirq/qsim_cuda"), + CMakeExtension("qsimcirq/qsim_custatevec"), CMakeExtension("qsimcirq/qsim_decide"), ], cmdclass=dict(build_ext=CMakeBuild), diff --git a/tests/BUILD b/tests/BUILD index 71a4f916..a6c05c96 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -31,6 +31,23 @@ cc_test( ], ) +cc_test( + name = "channel_test", + srcs = ["channel_test.cc"], + deps = [ + "@com_google_googletest//:gtest_main", + "//lib:channel", + "//lib:formux", + "//lib:gates_cirq", + "//lib:matrix", + "//lib:simulator", + ], + copts = select({ + ":windows": windows_copts, + "//conditions:default": [], + }), +) + cc_test( name = "channels_cirq_test", srcs = ["channels_cirq_test.cc"], @@ -197,8 +214,10 @@ cc_library( }), deps = [ "@com_google_googletest//:gtest_main", + "//lib:channel", "//lib:channels_cirq", "//lib:circuit_noisy", + "//lib:expect", "//lib:fuser_mqubit", "//lib:gate_appl", "//lib:gates_cirq", @@ -265,6 +284,7 @@ cc_library( "//lib:gate_appl", "//lib:gates_qsim", "//lib:io", + "//lib:util_cpu", ], testonly = 1, ) @@ -612,6 +632,7 @@ cc_test( "@com_google_googletest//:gtest_main", "//lib:gate_appl", "//lib:gates_cirq", + "//lib:gates_qsim", "//lib:mps_simulator", "//lib:formux", ], diff --git a/tests/Makefile b/tests/Makefile index d10e887b..f4a37278 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -8,8 +8,11 @@ CXX_TARGETS = $(shell\ ) CXX_TARGETS := $(CXX_TARGETS:%.cc=%.x) -CUDA_TARGETS = 
$(shell find . -maxdepth 1 -name "*_test.cu") -CUDA_TARGETS := $(CUDA_TARGETS:%.cu=%.x) +CUDA_TARGETS = $(shell find . -maxdepth 1 -name "*cuda_test.cu") +CUDA_TARGETS := $(CUDA_TARGETS:%cuda_test.cu=%cuda_test.x) + +CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec_test.cu") +CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec_test.cu=%custatevec_test.x) GTEST_DIR = $(CURDIR)/googletest/googletest GMOCK_DIR = $(CURDIR)/googletest/googlemock @@ -24,6 +27,9 @@ cxx-tests: $(CXX_TARGETS) .PHONY: cuda-tests cuda-tests: $(CUDA_TARGETS) +.PHONY: custatevec-tests +custatevec-tests: $(CUSTATEVEC_TARGETS) + .PHONY: run-cxx-tests run-cxx-tests: cxx-tests for exe in $(CXX_TARGETS); do if ! ./$$exe; then exit 1; fi; done @@ -32,6 +38,10 @@ run-cxx-tests: cxx-tests run-cuda-tests: cuda-tests for exe in $(CUDA_TARGETS); do if ! ./$$exe; then exit 1; fi; done +.PHONY: run-custatevec-tests +run-custatevec-tests: custatevec-tests + for exe in $(CUSTATEVEC_TARGETS); do if ! ./$$exe; then exit 1; fi; done + $(GTEST_DIR)/make: -git submodule update --init --recursive googletest mkdir -p $(GTEST_DIR)/make @@ -40,9 +50,12 @@ $(GTEST_DIR)/make: %.x: %.cc $(GTEST_DIR)/make $(CXX) -o ./$@ $< $(TESTFLAGS) $(CXXFLAGS) $(ARCHFLAGS) -%.x: %.cu $(GTEST_DIR)/make +%cuda_test.x: %cuda_test.cu $(GTEST_DIR)/make $(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) +%custatevec_test.x: %custatevec_test.cu $(GTEST_DIR)/make + $(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS) + .PHONY: clean clean: -rm -f ./*.x ./*.a ./*.so ./*.mod diff --git a/tests/channel_test.cc b/tests/channel_test.cc new file mode 100644 index 00000000..15b2307e --- /dev/null +++ b/tests/channel_test.cc @@ -0,0 +1,256 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "../lib/channel.h" +#include "../lib/formux.h" +#include "../lib/gates_cirq.h" +#include "../lib/matrix.h" +#include "../lib/simmux.h" + +namespace qsim { + +namespace { + +template +void TestUnitaryKrausOparator(const KrausOperator& kop) { + // Should be an identity matrix. + + unsigned m = 1 << kop.qubits.size(); + + for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + auto re = kop.kd_k.data()[2 * m * i + 2 * j]; + auto im = kop.kd_k.data()[2 * m * i + 2 * j + 1]; + + if (i == j) { + EXPECT_NEAR(re, 1, 1e-6); + EXPECT_NEAR(im, 0, 1e-7); + } else { + EXPECT_NEAR(re, 0, 1e-7); + EXPECT_NEAR(im, 0, 1e-7); + } + } + } +} + +template +void TestNonUnitaryKrausOparator(const KrausOperator& kop, + const StateSpace& state_space, + const Simulator& simulator, + State& state0, State& state1) { + state_space.SetStateUniform(state0); + state_space.SetStateUniform(state1); + + for (const auto&op : kop.ops) { + simulator.ApplyGate(op.qubits, op.matrix.data(), state0); + } + + for (auto it = kop.ops.rbegin(); it != kop.ops.rend(); ++it) { + auto md = it->matrix; + MatrixDagger(1 << it->qubits.size(), md); + simulator.ApplyGate(it->qubits, md.data(), state0); + } + + simulator.ApplyGate(kop.qubits, kop.kd_k.data(), state1); + + unsigned size = unsigned{1} << (state0.num_qubits() + 1); + + for (unsigned i = 0; i < size; ++i) { + EXPECT_NEAR(state0.get()[i], state1.get()[i], 1e-7); + } +} + +} // namespace + +TEST(ChannelTest, UnitaryKdKMatrix) { + using fp_type = 
Simulator::fp_type; + using Gate = Cirq::GateCirq; + + auto normal = KrausOperator::kNormal; + + Channel channel = { + { + normal, 1, 0.2, { + Cirq::FSimGate::Create(0, 3, 4, 0.1, 1.4), + } + }, + { + normal, 1, 0.2, { + Cirq::rx::Create(0, 0, 0.1), + Cirq::ry::Create(0, 1, 0.2), + Cirq::FSimGate::Create(1, 0, 1, 0.2, 1.3), + } + }, + { + normal, 1, 0.2, { + Cirq::rz::Create(0, 3, 0.3), + Cirq::rx::Create(0, 1, 0.4), + Cirq::ry::Create(0, 4, 0.5), + Cirq::rz::Create(0, 0, 0.6), + } + }, + { + normal, 1, 0.2, { + Cirq::rx::Create(0, 4, 0.7), + Cirq::ry::Create(0, 3, 0.8), + Cirq::rz::Create(0, 1, 0.9), + Cirq::rx::Create(0, 0, 1.0), + Cirq::FSimGate::Create(1, 1, 3, 0.3, 1.2), + Cirq::FSimGate::Create(1, 0, 4, 0.4, 1.1), + } + }, + { + normal, 1, 0.2, { + Cirq::ry::Create(0, 7, 1.1), + Cirq::rz::Create(0, 5, 1.2), + Cirq::rx::Create(0, 1, 1.3), + Cirq::ry::Create(0, 3, 1.4), + Cirq::rz::Create(0, 2, 1.5), + Cirq::rx::Create(0, 4, 1.6), + Cirq::FSimGate::Create(1, 4, 5, 0.5, 1.0), + Cirq::FSimGate::Create(1, 1, 3, 0.6, 0.9), + Cirq::FSimGate::Create(1, 2, 7, 0.7, 0.8), + } + }, + }; + + channel[0].CalculateKdKMatrix(); + ASSERT_EQ(channel[0].kd_k.size(), 32); + ASSERT_EQ(channel[0].qubits.size(), 2); + EXPECT_EQ(channel[0].qubits[0], 3); + EXPECT_EQ(channel[0].qubits[1], 4); + TestUnitaryKrausOparator(channel[0]); + + channel[1].CalculateKdKMatrix(); + ASSERT_EQ(channel[1].kd_k.size(), 32); + ASSERT_EQ(channel[1].qubits.size(), 2); + EXPECT_EQ(channel[1].qubits[0], 0); + EXPECT_EQ(channel[1].qubits[1], 1); + TestUnitaryKrausOparator(channel[1]); + + channel[2].CalculateKdKMatrix(); + ASSERT_EQ(channel[2].kd_k.size(), 512); + ASSERT_EQ(channel[2].qubits.size(), 4); + EXPECT_EQ(channel[2].qubits[0], 0); + EXPECT_EQ(channel[2].qubits[1], 1); + EXPECT_EQ(channel[2].qubits[2], 3); + EXPECT_EQ(channel[2].qubits[3], 4); + TestUnitaryKrausOparator(channel[2]); + + channel[3].CalculateKdKMatrix(); + ASSERT_EQ(channel[3].kd_k.size(), 512); + ASSERT_EQ(channel[3].qubits.size(), 
4); + EXPECT_EQ(channel[3].qubits[0], 0); + EXPECT_EQ(channel[3].qubits[1], 1); + EXPECT_EQ(channel[3].qubits[2], 3); + EXPECT_EQ(channel[3].qubits[3], 4); + TestUnitaryKrausOparator(channel[3]); + + channel[4].CalculateKdKMatrix(); + ASSERT_EQ(channel[4].kd_k.size(), 8192); + ASSERT_EQ(channel[4].qubits.size(), 6); + EXPECT_EQ(channel[4].qubits[0], 1); + EXPECT_EQ(channel[4].qubits[1], 2); + EXPECT_EQ(channel[4].qubits[2], 3); + EXPECT_EQ(channel[4].qubits[3], 4); + EXPECT_EQ(channel[4].qubits[4], 5); + EXPECT_EQ(channel[4].qubits[5], 7); + TestUnitaryKrausOparator(channel[4]); +} + +TEST(ChannelTest, NonUnitaryKdKMatrix) { + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using fp_type = StateSpace::fp_type; + using Gate = Cirq::GateCirq; + using M1 = Cirq::MatrixGate1; + using M2 = Cirq::MatrixGate2; + + unsigned num_qubits = 8; + auto normal = KrausOperator::kNormal; + + Channel channel = { + { + normal, 0, 0, { + M1::Create(0, 0, + {0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4}), + M1::Create(0, 1, + {0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1}), + M2::Create(0, 0, 1, + {0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, + 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, + 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, + 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3}), + } + }, + { + normal, 0, 0, { + M1::Create(0, 4, + {0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4}), + M1::Create(0, 3, + {0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1}), + M1::Create(0, 1, + {0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2}), + M1::Create(0, 0, + {0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3}), + M2::Create(0, 0, 4, + {0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, + 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, + 0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, + 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3}), + } + }, + }; + + StateSpace state_space(1); + Simulator simulator(1); + + State state0 = state_space.Create(num_qubits); + ASSERT_FALSE(state_space.IsNull(state0)); + + State state1 = state_space.Create(num_qubits); + 
ASSERT_FALSE(state_space.IsNull(state1)); + + channel[0].CalculateKdKMatrix(); + ASSERT_EQ(channel[0].kd_k.size(), 32); + ASSERT_EQ(channel[0].qubits.size(), 2); + EXPECT_EQ(channel[0].qubits[0], 0); + EXPECT_EQ(channel[0].qubits[1], 1); + TestNonUnitaryKrausOparator( + channel[0], state_space, simulator, state0, state1); + + channel[1].CalculateKdKMatrix(); + ASSERT_EQ(channel[1].kd_k.size(), 512); + ASSERT_EQ(channel[1].qubits.size(), 4); + EXPECT_EQ(channel[1].qubits[0], 0); + EXPECT_EQ(channel[1].qubits[1], 1); + EXPECT_EQ(channel[1].qubits[2], 3); + EXPECT_EQ(channel[1].qubits[3], 4); + TestNonUnitaryKrausOparator( + channel[1], state_space, simulator, state0, state1); +} + +} // namespace qsim + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/channels_cirq_test.cc b/tests/channels_cirq_test.cc index b7bdade5..dedb922e 100644 --- a/tests/channels_cirq_test.cc +++ b/tests/channels_cirq_test.cc @@ -68,10 +68,10 @@ void RunBatch(const NoisyCircuit& ncircuit, unsigned num_threads = 1; auto measure = [](uint64_t r, const State& state, - const std::vector& stat, + const QTSimulator::Stat& stat, std::vector& histogram) { - ASSERT_EQ(stat.size(), 1); - ++histogram[stat[0]]; + ASSERT_EQ(stat.samples.size(), 1); + ++histogram[stat.samples[0]]; }; std::vector histogram(1 << num_qubits, 0); diff --git a/tests/expect_test.cc b/tests/expect_test.cc index 19f4eb10..61e0fa77 100644 --- a/tests/expect_test.cc +++ b/tests/expect_test.cc @@ -94,7 +94,7 @@ TEST(ExpectTest, ExpectationValue) { Fuser::Parameter param; param.max_fused_size = 4; - State tmp_state = state_space.Create(num_qubits); + State tmp_state = state_space.Null(); for (unsigned k = 1; k <= 6; ++k) { std::vector>> strings; @@ -120,8 +120,12 @@ TEST(ExpectTest, ExpectationValue) { } } - auto evala = ExpectationValue(param, strings, state_space, simulator, - state, tmp_state); + if (k == 2) { + tmp_state = state_space.Create(num_qubits - 2); 
+ } + + auto evala = ExpectationValue(param, strings, state_space, + simulator, state, tmp_state); EXPECT_NEAR(std::real(evala), expected_real[k - 1], 1e-6); EXPECT_NEAR(std::imag(evala), 0, 1e-8); diff --git a/tests/fuser_basic_test.cc b/tests/fuser_basic_test.cc index 250d9f30..1eb85c28 100644 --- a/tests/fuser_basic_test.cc +++ b/tests/fuser_basic_test.cc @@ -1438,6 +1438,41 @@ TEST(FuserBasicTest, InvalidTimeOrder) { } } +TEST(FuserBasicTest, QubitsOutOfRange) { + using Gate = GateQSim; + using Fuser = BasicGateFuser; + + Fuser::Parameter param; + param.verbosity = 0; + + { + unsigned num_qubits = 3; + std::vector circuit = { + GateCZ::Create(0, 0, 3), + GateCZ::Create(0, 1, 2), + }; + + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 0); + } + + { + unsigned num_qubits = 3; + auto gate = GateZ::Create(0, 2); + std::vector circuit = { + GateCZ::Create(0, 0, 1), + MakeControlledGate({3}, gate), + }; + + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 0); + } +} + } // namespace qsim int main(int argc, char** argv) { diff --git a/tests/fuser_mqubit_test.cc b/tests/fuser_mqubit_test.cc index d63e20af..cf49b73b 100644 --- a/tests/fuser_mqubit_test.cc +++ b/tests/fuser_mqubit_test.cc @@ -44,6 +44,7 @@ enum DummyGateKind { struct DummyGate { using GateKind = DummyGateKind; + using fp_type = float; GateKind kind; unsigned time; @@ -263,7 +264,7 @@ TEST(FuserMultiQubitTest, RandomCircuit1) { for (unsigned q = 2; q <= 6; ++q) { param.max_fused_size = q; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -272,7 +273,7 @@ TEST(FuserMultiQubitTest, RandomCircuit1) { param.max_fused_size = q; auto fused_gates = Fuser::FuseGates( param, 
num_qubits, circuit.begin(), circuit.end(), - {5000, 7000, 25000, 37000}); + {5000, 7000, 25000, 37000}, false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -301,7 +302,7 @@ TEST(FuserMultiQubitTest, RandomCircuit2) { for (unsigned q = 2; q <= 6; ++q) { param.max_fused_size = q; auto fused_gates = Fuser::FuseGates( - param, num_qubits, pcircuit.begin(), pcircuit.end()); + param, num_qubits, pcircuit.begin(), pcircuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -309,7 +310,8 @@ TEST(FuserMultiQubitTest, RandomCircuit2) { for (unsigned q = 2; q <= 6; ++q) { param.max_fused_size = q; auto fused_gates = Fuser::FuseGates( - param, num_qubits, pcircuit.begin(), pcircuit.end(), {300, 700, 2400}); + param, num_qubits, pcircuit.begin(), pcircuit.end(), + {300, 700, 2400}, false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -404,7 +406,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 4; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 1); @@ -424,7 +426,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 4; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 3); @@ -447,7 +449,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 6; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 1); @@ -472,7 +474,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { 
param.max_fused_size = 6; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 3); @@ -488,7 +490,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 2); @@ -505,7 +507,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 3); @@ -523,7 +525,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 3); @@ -542,7 +544,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 4); @@ -563,7 +565,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 5; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 3); @@ -579,7 +581,7 @@ 
TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 1); @@ -595,7 +597,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 1); @@ -613,7 +615,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 4); @@ -633,7 +635,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); EXPECT_EQ(fused_gates.size(), 4); @@ -642,7 +644,7 @@ TEST(FuserMultiQubitTest, SmallCircuits) { { param.max_fused_size = 5; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -680,7 +682,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 14); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -705,7 +707,7 @@ 
TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 6; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 3); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -724,7 +726,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 4); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -745,7 +747,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 5; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 3); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -763,7 +765,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 4); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -783,7 +785,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -791,7 +793,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { { param.max_fused_size = 5; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); } @@ -814,8 +816,8 @@ TEST(FuserMultiQubitTest, 
ValidTimeOrder) { param.max_fused_size = 4; std::vector time_boundary = {3}; - auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end(), time_boundary); + auto fused_gates = Fuser::FuseGates(param, num_qubits, circuit.begin(), + circuit.end(), time_boundary, false); EXPECT_EQ(fused_gates.size(), 2); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -838,7 +840,7 @@ TEST(FuserMultiQubitTest, ValidTimeOrder) { param.max_fused_size = 4; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 3); EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); @@ -860,7 +862,7 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 3; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } @@ -876,7 +878,7 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } @@ -892,8 +894,8 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; std::vector time_boundary = {1}; - auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end(), time_boundary); + auto fused_gates = Fuser::FuseGates(param, num_qubits, circuit.begin(), + circuit.end(), time_boundary, false); EXPECT_EQ(fused_gates.size(), 0); } @@ -909,8 +911,8 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; std::vector time_boundary = {2}; - auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end(), time_boundary); + auto fused_gates = Fuser::FuseGates(param, num_qubits, circuit.begin(), + 
circuit.end(), time_boundary, false); EXPECT_EQ(fused_gates.size(), 0); } @@ -926,7 +928,7 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } @@ -942,7 +944,7 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } @@ -958,7 +960,7 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } @@ -974,10 +976,134 @@ TEST(FuserMultiQubitTest, InvalidTimeOrder) { param.max_fused_size = 2; auto fused_gates = Fuser::FuseGates( - param, num_qubits, circuit.begin(), circuit.end()); + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 0); + } +} + +TEST(FuserMultiQubitTest, QubitsOutOfRange) { + using Fuser = MultiQubitGateFuser; + + Fuser::Parameter param; + param.verbosity = 0; + + { + unsigned num_qubits = 3; + std::vector circuit = { + CreateDummyGate(0, {0, 3}), + CreateDummyGate(0, {1, 2}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); EXPECT_EQ(fused_gates.size(), 0); } + + { + unsigned num_qubits = 3; + std::vector circuit = { + CreateDummyGate(0, {0, 1}), + CreateDummyControlledGate(0, {2}, {3}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 0); + } +} + +TEST(FuserMultiQubitTest, OrphanedGates) { + using 
Fuser = MultiQubitGateFuser; + + std::vector circuit; + circuit.reserve(6); + + Fuser::Parameter param; + param.verbosity = 0; + + for (unsigned num_qubits = 2; num_qubits <= 6; ++ num_qubits) { + circuit.resize(0); + + for (unsigned q = 0; q < num_qubits; ++q) { + circuit.push_back(CreateDummyGate(0, {q})); + } + + for (unsigned f = 2; f <= num_qubits; ++f) { + param.max_fused_size = f; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_TRUE(TestFusedGates(num_qubits, circuit, fused_gates)); + EXPECT_EQ(fused_gates.size(), (num_qubits - 1) / f + 1); + } + } + + { + unsigned num_qubits = 4; + std::vector circuit = { + CreateDummyGate(0, {0}), + CreateDummyGate(0, {1}), + CreateDummyGate(0, {2}), + CreateDummyGate(0, {3}), + CreateDummyGate(1, {0, 3}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 2); + } + + { + unsigned num_qubits = 4; + std::vector circuit = { + CreateDummyGate(0, {0, 3}), + CreateDummyGate(1, {0}), + CreateDummyGate(1, {1}), + CreateDummyGate(1, {2}), + CreateDummyGate(1, {3}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 2); + } + + { + unsigned num_qubits = 3; + std::vector circuit = { + CreateDummyGate(0, {0}), + CreateDummyControlledGate(0, {1}, {2}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 2); + } + + { + unsigned num_qubits = 3; + std::vector circuit = { + CreateDummyGate(0, {0}), + CreateDummyMeasurementGate(0, {2}), + }; + + param.max_fused_size = 2; + auto fused_gates = Fuser::FuseGates( + param, num_qubits, circuit.begin(), circuit.end(), false); + + EXPECT_EQ(fused_gates.size(), 2); + } } } // namespace 
qsim diff --git a/tests/hybrid_custatevec_test.cu b/tests/hybrid_custatevec_test.cu new file mode 100644 index 00000000..82dbeafc --- /dev/null +++ b/tests/hybrid_custatevec_test.cu @@ -0,0 +1,67 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hybrid_testfixture.h" + +#include +#include + +#include "gtest/gtest.h" + +#include "../lib/simulator_custatevec.h" + +namespace qsim { + +template +struct Factory { + using fp_type = FP; + using Simulator = qsim::SimulatorCuStateVec; + using StateSpace = typename Simulator::StateSpace; + + Factory() { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; +}; + +TEST(HybridCuStateVecTest, Hybrid2) { + TestHybrid2(qsim::Factory()); +} + +TEST(HybridCuStateVecTest, Hybrid4) { + TestHybrid4(qsim::Factory()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/make.sh b/tests/make.sh index 37b91723..f379a102 100755 --- a/tests/make.sh +++ 
b/tests/make.sh @@ -18,34 +18,39 @@ # Prefer using the Makefile (e.g. `make -C tests/`) if possible. path_to_include=googletest/googletest/include -path_to_lib=googletest/lib +path_to_lib=googletest/googletest/make/lib g++ -O3 -I$path_to_include -L$path_to_lib -o bitstring_test.x bitstring_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o channel_test.x channel_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o channels_cirq_test.x channels_cirq_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o circuit_qsim_parser_test.x circuit_qsim_parser_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o expect_test.x expect_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o expect_nobmi2_test.x expect_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -mbmi2 -fopenmp -o expect_test.x expect_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o fuser_basic_test.x fuser_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o fuser_mqubit_test.x fuser_mqubit_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o gates_qsim_test.x gates_qsim_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o hybrid_avx_test.x hybrid_avx_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o matrix_test.x matrix_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o qtrajectory_avx_test.x qtrajectory_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o qtrajectory_avx_nobmi2_test.x qtrajectory_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -mbmi2 -o qtrajectory_avx_test.x qtrajectory_avx_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include 
-L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsim_test.x run_qsim_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsimh_test.x run_qsimh_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o simulator_avx_test.x simulator_avx_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o simulator_avx512_test.x simulator_avx512_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o simulator_avx_nobmi2_test.x simulator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -mbmi2 -fopenmp -o simulator_avx_test.x simulator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mbmi2 -fopenmp -o simulator_avx512_test.x simulator_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o simulator_basic_test.x simulator_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o simulator_sse_test.x simulator_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o statespace_avx_test.x statespace_avx_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o statespace_avx512_test.x statespace_avx512_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mbmi2 -fopenmp -o statespace_avx512_test.x statespace_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o statespace_basic_test.x statespace_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o statespace_sse_test.x statespace_sse_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitary_calculator_avx_test.x unitary_calculator_avx_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o 
unitary_calculator_avx512_test.x unitary_calculator_avx512_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitary_calculator_avx_nobmi2_test.x unitary_calculator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -mbmi2 -fopenmp -o unitary_calculator_avx_test.x unitary_calculator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mbmi2 -mfma -fopenmp -o unitary_calculator_avx512_test.x unitary_calculator_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitary_calculator_basic_test.x unitary_calculator_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitary_calculator_sse_test.x unitary_calculator_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitaryspace_avx_test.x unitaryspace_avx_test.cc -lgtest -lpthread -g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o unitaryspace_avx512_test.x unitaryspace_avx512_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mbmi2 -fopenmp -o unitaryspace_avx512_test.x unitaryspace_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitaryspace_basic_test.x unitaryspace_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitaryspace_sse_test.x unitaryspace_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o vectorspace_test.x vectorspace_test.cc -lgtest -lpthread @@ -54,3 +59,10 @@ nvcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_cuda_test.x hybrid_cuda_tes nvcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_cuda_test.x qtrajectory_cuda_test.cu -lgtest -lpthread nvcc -O3 -I$path_to_include -L$path_to_lib -o simulator_cuda_test.x simulator_cuda_test.cu -lgtest -lpthread nvcc -O3 -I$path_to_include -L$path_to_lib -o 
statespace_cuda_test.x statespace_cuda_test.cu -lgtest -lpthread + +# CUQUANTUM_DIR should be set. +CUSTATEVECFLAGS="-I${CUQUANTUM_DIR}/include -L${CUQUANTUM_DIR}/lib -L${CUQUANTUM_DIR}/lib64 -lcustatevec -lcublas" +nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o hybrid_custatevec_test.x hybrid_custatevec_test.cu -lgtest -lpthread +nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o qtrajectory_custatevec_test.x qtrajectory_custatevec_test.cu -lgtest -lpthread +nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o simulator_custatevec_test.x simulator_custatevec_test.cu -lgtest -lpthread +nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o statespace_custatevec_test.x statespace_custatevec_test.cu -lgtest -lpthread diff --git a/tests/mps_simulator_test.cc b/tests/mps_simulator_test.cc index 0bbff966..c607d1aa 100644 --- a/tests/mps_simulator_test.cc +++ b/tests/mps_simulator_test.cc @@ -17,6 +17,7 @@ #include "../lib/formux.h" #include "../lib/gate_appl.h" #include "../lib/gates_cirq.h" +#include "../lib/gates_qsim.h" #include "gtest/gtest.h" namespace qsim { @@ -26,7 +27,7 @@ namespace mps { namespace { TEST(MPSSimulator, Create) { - auto sim = MPSSimulator(1); + MPSSimulator(1); } TEST(MPSSimulator, Apply1RightArbitrary) { @@ -802,6 +803,113 @@ TEST(MPSSimulator, OneTwoQubitFuzz) { */ } +TEST(MPSSimulator, ApplyFusedGateLeft) { + // Apply a fused gate matrix to the first two qubits. 
+ // Compute the state vector of: + // | | | + // +-+-----+-+ | + // |FusedGate| | + // +-+-----+-+ | + // | | | + // +-+-+ +-+-+ +-+-+ + // | 0 +-+ 1 +-+ 2 | + // +---+ +---+ +---+ + auto sim = MPSSimulator(1); + using MPSStateSpace = MPSSimulator::MPSStateSpace_; + auto ss = MPSStateSpace(1); + + auto gate1 = GateCZ::Create(2, 0, 1); + auto gate2 = GateHd::Create(0, 0); + auto gate3 = GateHd::Create(0, 1); + + GateFused> fgate1{kGateCZ, 2, {0, 1}, &gate1, + {&gate2, &gate3}, {}}; + CalculateFusedMatrix(fgate1); + auto mps = ss.Create(3, 4); + ss.SetStateZero(mps); + ApplyFusedGate(sim, fgate1, mps); + + float wf[32]; + float ground_truth[] = {0.5, 0., 0., 0., 0.5, 0., 0., 0., + 0.5, 0., 0., 0., 0.5, 0., 0., 0.}; + ss.ToWaveFunction(mps, wf); + for (int i = 0; i < 16; i++) { + EXPECT_NEAR(wf[i], ground_truth[i], 1e-4); + } +} + +TEST(MPSSimulator, ApplyFusedGateRight) { + // Apply a fused gate matrix to the last two qubits. + // Compute the state vector of: + // | | | + // | +-+-----+-+ + // | |FusedGate| + // | +-+-----+-+ + // | | | + // +-+-+ +-+-+ +-+-+ + // | 0 +-+ 1 +-+ 2 | + // +---+ +---+ +---+ + auto sim = MPSSimulator(1); + using MPSStateSpace = MPSSimulator::MPSStateSpace_; + auto ss = MPSStateSpace(1); + + auto gate1 = GateCZ::Create(2, 1, 2); + auto gate2 = GateHd::Create(0, 1); + auto gate3 = GateHd::Create(0, 2); + + GateFused> fgate1{kGateCZ, 2, {1, 2}, &gate1, + {&gate2, &gate3}, {}}; + CalculateFusedMatrix(fgate1); + auto mps = ss.Create(3, 4); + ss.SetStateZero(mps); + ApplyFusedGate(sim, fgate1, mps); + + float wf[32]; + float ground_truth[] = {0.5, 0., 0.5, 0., 0.5, 0., 0.5, 0., + 0., 0., 0., 0., 0., 0., 0., 0.}; + ss.ToWaveFunction(mps, wf); + for (int i = 0; i < 16; i++) { + EXPECT_NEAR(wf[i], ground_truth[i], 1e-4); + } +} + +TEST(MPSSimulator, ApplyFusedGateMiddle) { + // Apply a fused gate matrix to the middle two qubits. 
+ // Compute the state vector of: + // | | | | + // | +-+-----+-+ | + // | |FusedGate| | + // | +-+-----+-+ | + // | | | | + // +-+-+ +-+-+ +-+-+ +-+-+ + // | 0 +-+ 1 +-+ 2 |-| 3 | + // +---+ +---+ +---+ +-+-+ + auto sim = MPSSimulator(1); + using MPSStateSpace = MPSSimulator::MPSStateSpace_; + auto ss = MPSStateSpace(1); + + auto gate1 = GateCZ::Create(2, 1, 2); + auto gate2 = GateHd::Create(0, 1); + auto gate3 = GateHd::Create(0, 2); + + GateFused> fgate1{kGateCZ, 2, {1, 2}, &gate1, + {&gate2, &gate3}, {}}; + CalculateFusedMatrix(fgate1); + auto mps = ss.Create(4, 4); + ss.SetStateZero(mps); + ApplyFusedGate(sim, fgate1, mps); + + float wf[64]; + float ground_truth[] = {0.5, 0., 0., 0., 0.5, 0., 0., 0., + 0.5, 0., 0., 0., 0.5, 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0.}; + ss.ToWaveFunction(mps, wf); + for (int i = 0; i < 32; i++) { + EXPECT_NEAR(wf[i], ground_truth[i], 1e-4); + } +} + } // namespace } // namespace mps } // namespace qsim diff --git a/tests/mps_statespace_test.cc b/tests/mps_statespace_test.cc index 4860c1ff..1ea037c5 100644 --- a/tests/mps_statespace_test.cc +++ b/tests/mps_statespace_test.cc @@ -586,6 +586,534 @@ TEST(MPSStateSpaceTest, InnerProduct4) { EXPECT_NEAR(f, 0.5524, 1e-4); } +TEST(MPSStateSpaceTest, ReduceDensityMatrixLarge){ + auto ss = MPSStateSpace(1); + auto mps = ss.Create(5, 8); + auto scratch = ss.Create(5, 8); + + // Set to highly entangled five qubit state. 
+ memset(mps.get(), 0, ss.RawSize(mps)); + mps.get()[ 0 ] = 0.2309518310679887 ; + mps.get()[ 1 ] = 0.6567032847786496 ; + mps.get()[ 2 ] = 0.2768328727585293 ; + mps.get()[ 3 ] = 0.6623938314484864 ; + mps.get()[ 16 ] = -0.2726304616546085 ; + mps.get()[ 17 ] = -0.6641345176352952 ; + mps.get()[ 18 ] = 0.3010495535897523 ; + mps.get()[ 19 ] = 0.627667980511763 ; + mps.get()[ 32 ] = -0.3191944318061405 ; + mps.get()[ 33 ] = 0.3643805550045959 ; + mps.get()[ 34 ] = -0.04874347818313596 ; + mps.get()[ 35 ] = 0.6919258000829638 ; + mps.get()[ 36 ] = 0.49609358835838246 ; + mps.get()[ 37 ] = -0.04574935292831311 ; + mps.get()[ 38 ] = 0.09997934427265631 ; + mps.get()[ 39 ] = -0.16126613211648944 ; + mps.get()[ 48 ] = 0.11471063502833712 ; + mps.get()[ 49 ] = -0.17128708246811702 ; + mps.get()[ 50 ] = -0.537166251460361 ; + mps.get()[ 51 ] = 0.2131135551453959 ; + mps.get()[ 52 ] = -0.19584141695097979 ; + mps.get()[ 53 ] = -0.5224748182520113 ; + mps.get()[ 54 ] = -0.4786314439048336 ; + mps.get()[ 55 ] = -0.28829738309774056 ; + mps.get()[ 64 ] = -0.22760229053474473 ; + mps.get()[ 65 ] = -0.3983913913209793 ; + mps.get()[ 66 ] = 0.02677088226132468 ; + mps.get()[ 67 ] = 0.019402378895274214 ; + mps.get()[ 68 ] = 0.11824715374271104 ; + mps.get()[ 69 ] = -0.6255924221100192 ; + mps.get()[ 70 ] = 0.4373827784232901 ; + mps.get()[ 71 ] = 0.43787715330879845 ; + mps.get()[ 80 ] = 0.46930961473374244 ; + mps.get()[ 81 ] = 0.5404386307552339 ; + mps.get()[ 82 ] = -0.4215827008929379 ; + mps.get()[ 83 ] = 0.07792743601226776 ; + mps.get()[ 84 ] = -0.17872276471126872 ; + mps.get()[ 85 ] = -0.05634054845076332 ; + mps.get()[ 86 ] = 0.31229453092671483 ; + mps.get()[ 87 ] = 0.41379458050793116 ; + mps.get()[ 168 ] = 1.0 ; + mps.get()[ 169 ] = 0.0 ; + mps.get()[ 186 ] = 1.0 ; + mps.get()[ 187 ] = 0.0 ; + mps.get()[ 204 ] = 1.0 ; + mps.get()[ 205 ] = 0.0 ; + mps.get()[ 222 ] = 1.0 ; + mps.get()[ 223 ] = 0.0 ; + mps.get()[ 288 ] = -0.6130799737070379 ; + mps.get()[ 289 ] = 0.0 ; 
+ mps.get()[ 290 ] = 0.043268104759918435 ; + mps.get()[ 291 ] = -0.18995272475290723 ; + mps.get()[ 292 ] = 0.12876152804781352 ; + mps.get()[ 293 ] = -0.11893213425423177 ; + mps.get()[ 294 ] = 0.1607817034199853 ; + mps.get()[ 295 ] = 0.24665465943552506 ; + mps.get()[ 296 ] = 0.24552427823440562 ; + mps.get()[ 297 ] = 0.03609285713030893 ; + mps.get()[ 298 ] = -0.15990808712940063 ; + mps.get()[ 299 ] = 0.24468178722817494 ; + mps.get()[ 300 ] = -0.4914494834254487 ; + mps.get()[ 301 ] = 0.2405398049739527 ; + mps.get()[ 302 ] = -0.14134232805639232 ; + mps.get()[ 303 ] = 0.0487940490917071 ; + mps.get()[ 304 ] = 0.2742172608377126 ; + mps.get()[ 305 ] = 0.047271896662111734 ; + mps.get()[ 306 ] = -0.18146376283725354 ; + mps.get()[ 307 ] = 0.33152462391237286 ; + mps.get()[ 308 ] = 0.0773807177545771 ; + mps.get()[ 309 ] = 0.24654528023213645 ; + mps.get()[ 310 ] = 0.008528550130968378 ; + mps.get()[ 311 ] = 0.2390239731739813 ; + mps.get()[ 312 ] = -0.508089429071731 ; + mps.get()[ 313 ] = -0.002320091211748876 ; + mps.get()[ 314 ] = -0.13528872019886337 ; + mps.get()[ 315 ] = 0.31045800372692844 ; + mps.get()[ 316 ] = -0.3746798814674866 ; + mps.get()[ 317 ] = 0.1374707983071416 ; + mps.get()[ 318 ] = 0.06279287873849984 ; + mps.get()[ 319 ] = 0.345950035760249 ; + mps.get()[ 320 ] = 0.2170489335583332 ; + mps.get()[ 321 ] = 0.0 ; + mps.get()[ 322 ] = 0.25343359465176174 ; + mps.get()[ 323 ] = -0.06460873268181125 ; + mps.get()[ 324 ] = 0.46101262974278245 ; + mps.get()[ 325 ] = -0.2092480984031435 ; + mps.get()[ 326 ] = 0.13091963057724038 ; + mps.get()[ 327 ] = 0.22386537991270267 ; + mps.get()[ 328 ] = 0.12356459122286649 ; + mps.get()[ 329 ] = -0.4070091557135954 ; + mps.get()[ 330 ] = -0.28135621185555465 ; + mps.get()[ 331 ] = -0.3875888286693259 ; + mps.get()[ 332 ] = 0.03359145124758722 ; + mps.get()[ 333 ] = 0.08762316323030826 ; + mps.get()[ 334 ] = 0.2470684092385266 ; + mps.get()[ 335 ] = 0.2841719777265282 ; + mps.get()[ 336 ] = 
-0.10369449090740858 ; + mps.get()[ 337 ] = -0.06989422362389419 ; + mps.get()[ 338 ] = 0.22469512904806252 ; + mps.get()[ 339 ] = -0.5302042447544824 ; + mps.get()[ 340 ] = 0.1385544129616299 ; + mps.get()[ 341 ] = 0.16964924283127805 ; + mps.get()[ 342 ] = -0.04065543762807928 ; + mps.get()[ 343 ] = 0.005959393071539496 ; + mps.get()[ 344 ] = -0.4512829757314996 ; + mps.get()[ 345 ] = -0.2586701038412787 ; + mps.get()[ 346 ] = 0.5529069902589456 ; + mps.get()[ 347 ] = -0.03606848742855439 ; + mps.get()[ 348 ] = -0.14118940851816178 ; + mps.get()[ 349 ] = -0.06994728084240831 ; + mps.get()[ 350 ] = 0.018957416881223405 ; + mps.get()[ 351 ] = 0.01819725521128998 ; + mps.get()[ 352 ] = -0.17686033082431762 ; + mps.get()[ 353 ] = 0.0 ; + mps.get()[ 354 ] = 0.2931301972780771 ; + mps.get()[ 355 ] = 0.21016759261710286 ; + mps.get()[ 356 ] = -0.11356014183124072 ; + mps.get()[ 357 ] = -0.46047703337338425 ; + mps.get()[ 358 ] = -0.13658537870586535 ; + mps.get()[ 359 ] = -0.40395680360094305 ; + mps.get()[ 360 ] = -0.2192834055419525 ; + mps.get()[ 361 ] = 0.1147739015484587 ; + mps.get()[ 362 ] = 0.0406334021199655 ; + mps.get()[ 363 ] = -0.13324853969745315 ; + mps.get()[ 364 ] = -0.08677952965904745 ; + mps.get()[ 365 ] = -0.06520136749772663 ; + mps.get()[ 366 ] = -0.2592315135086059 ; + mps.get()[ 367 ] = 0.5217332205571495 ; + mps.get()[ 368 ] = 0.04262419252580863 ; + mps.get()[ 369 ] = 0.3848508087888552 ; + mps.get()[ 370 ] = 0.07107418272505975 ; + mps.get()[ 371 ] = -0.23184566218302685 ; + mps.get()[ 372 ] = 0.2521398405505697 ; + mps.get()[ 373 ] = 0.07367621634957493 ; + mps.get()[ 374 ] = 0.35237066854683313 ; + mps.get()[ 375 ] = -0.037975076688907206 ; + mps.get()[ 376 ] = 0.04017444083340625 ; + mps.get()[ 377 ] = 0.3474071346882127 ; + mps.get()[ 378 ] = -0.09328244237604688 ; + mps.get()[ 379 ] = 0.254631755111914 ; + mps.get()[ 380 ] = 0.11980261826920166 ; + mps.get()[ 381 ] = -0.5763161558192667 ; + mps.get()[ 382 ] = 0.03761704118581086 ; + 
mps.get()[ 383 ] = 0.23001403035505844 ; + mps.get()[ 384 ] = -0.26919429961914343 ; + mps.get()[ 385 ] = 0.0 ; + mps.get()[ 386 ] = -0.26974433859093544 ; + mps.get()[ 387 ] = 0.07475159568942476 ; + mps.get()[ 388 ] = 0.4614017944048309 ; + mps.get()[ 389 ] = -0.1653283114243778 ; + mps.get()[ 390 ] = -0.5969489482357646 ; + mps.get()[ 391 ] = -0.010072342322176322 ; + mps.get()[ 392 ] = -0.0019403159041931585 ; + mps.get()[ 393 ] = -0.07573380596911085 ; + mps.get()[ 394 ] = 0.05771235935096261 ; + mps.get()[ 395 ] = 0.32234604777657144 ; + mps.get()[ 396 ] = 0.28055767387417235 ; + mps.get()[ 397 ] = -0.08179104024696918 ; + mps.get()[ 398 ] = 0.22420649690508931 ; + mps.get()[ 399 ] = 0.06214434558530708 ; + mps.get()[ 400 ] = -0.10312646494528237 ; + mps.get()[ 401 ] = -0.46866618040645364 ; + mps.get()[ 402 ] = -0.3880666432855699 ; + mps.get()[ 403 ] = -0.03681826251267426 ; + mps.get()[ 404 ] = -0.25192564830049524 ; + mps.get()[ 405 ] = -0.024048027190332867 ; + mps.get()[ 406 ] = 0.2710542945659806 ; + mps.get()[ 407 ] = 0.22315379238735505 ; + mps.get()[ 408 ] = 0.1774445513167211 ; + mps.get()[ 409 ] = -0.08929584369397156 ; + mps.get()[ 410 ] = 0.2513518619982434 ; + mps.get()[ 411 ] = 0.010901632944735585 ; + mps.get()[ 412 ] = 0.20937810155968847 ; + mps.get()[ 413 ] = -0.14207394095443068 ; + mps.get()[ 414 ] = 0.09108907016995436 ; + mps.get()[ 415 ] = 0.5053457101186702 ; + mps.get()[ 544 ] = -0.23436455 ; + mps.get()[ 545 ] = 0.53141874 ; + mps.get()[ 546 ] = 0.06564701 ; + mps.get()[ 547 ] = -0.016569737 ; + mps.get()[ 560 ] = 0.27552164 ; + mps.get()[ 561 ] = 0.39324737 ; + mps.get()[ 562 ] = 0.03920218 ; + mps.get()[ 563 ] = 0.3126193 ; + mps.get()[ 576 ] = 0.08090294 ; + mps.get()[ 577 ] = 0.067396805 ; + mps.get()[ 578 ] = 0.38391852 ; + mps.get()[ 579 ] = 0.49181914 ; + mps.get()[ 592 ] = -0.34309888 ; + mps.get()[ 593 ] = -0.16689296 ; + mps.get()[ 594 ] = 0.3111027 ; + mps.get()[ 595 ] = 0.33973938 ; + mps.get()[ 608 ] = -0.39236507 ; + 
mps.get()[ 609 ] = 0.21032207 ; + mps.get()[ 610 ] = 0.084636666 ; + mps.get()[ 611 ] = -0.026731271 ; + mps.get()[ 624 ] = 0.048989747 ; + mps.get()[ 625 ] = -0.22122668 ; + mps.get()[ 626 ] = 0.24929003 ; + mps.get()[ 627 ] = -0.23605682 ; + mps.get()[ 640 ] = 0.02890851 ; + mps.get()[ 641 ] = -0.008860635 ; + mps.get()[ 642 ] = -0.30513528 ; + mps.get()[ 643 ] = 0.14362136 ; + mps.get()[ 656 ] = 0.008236099 ; + mps.get()[ 657 ] = 0.15793985 ; + mps.get()[ 658 ] = 0.16013248 ; + mps.get()[ 659 ] = -0.17186542 ; + mps.get()[ 672 ] = 2.1042667e-32 ; + mps.get()[ 673 ] = -1.7309414e-32 ; + mps.get()[ 674 ] = -1.18279285e-32 ; + mps.get()[ 675 ] = 4.1110057e-33 ; + mps.get()[ 688 ] = 1.6622225e-33 ; + mps.get()[ 689 ] = 1.718814e-32 ; + mps.get()[ 690 ] = -1.2495627e-32 ; + mps.get()[ 691 ] = 1.3456783e-32 ; + mps.get()[ 704 ] = -1.5954751e-32 ; + mps.get()[ 705 ] = 1.0564825e-32 ; + mps.get()[ 706 ] = 8.3999555e-33 ; + mps.get()[ 707 ] = 7.0071443e-34 ; + mps.get()[ 720 ] = 3.413174e-34 ; + mps.get()[ 721 ] = -1.01854646e-32 ; + mps.get()[ 722 ] = 1.0789845e-32 ; + mps.get()[ 723 ] = -5.4311017e-33 ; + mps.get()[ 800 ] = 0.503408 ; + mps.get()[ 801 ] = 0.0 ; + mps.get()[ 802 ] = -0.48754588 ; + mps.get()[ 803 ] = 0.38097852 ; + mps.get()[ 804 ] = -0.4678266 ; + mps.get()[ 805 ] = 0.0 ; + mps.get()[ 806 ] = -0.29991406 ; + mps.get()[ 807 ] = 0.2343591 ; + + // The below snippets compute the following: + // | + // +---+ +---+ +---+ +---+ +---+ + // mps2 | 0 +-+ 1 +-+ 2 +-+ 3 |-+ 4 | + // +-+-+ +-+-+ +-+-+ +-+-+ +-+-+ + // | | | | + // | | | | + // +-+-+ +-+-+ +-+-+ +-+-+ +-+-+ + // mps | 0 +-+ 1 +-+ 2 +-+ 3 |-+ 4 | + // +---+ +---+ +---+ +---+ +---+ + // | + //----------------------------------------- + // | + // +---+ +---+ +---+ +---+ +---+ + // mps2 | 0 +-+ 1 +-+ 2 +-+ 3 |-+ 4 | + // +-+-+ +-+-+ +-+-+ +-+-+ +-+-+ + // | | | | + // | | | | + // +-+-+ +-+-+ +-+-+ +-+-+ +-+-+ + // mps | 0 +-+ 1 +-+ 2 +-+ 3 |-+ 4 | + // +---+ +---+ +---+ +---+ +---+ + // | + 
//----------------------------------------- + // And so on. + + float rdm[8]; + ss.ReduceDensityMatrix(mps, scratch, 0, rdm); + EXPECT_NEAR(rdm[ 0 ], 0.43434495 , 1e-4); + EXPECT_NEAR(rdm[ 1 ], 0.0 , 1e-4); + EXPECT_NEAR(rdm[ 2 ], 0.0452075 , 1e-4); + EXPECT_NEAR(rdm[ 3 ], -0.06079552 , 1e-4); + EXPECT_NEAR(rdm[ 4 ], 0.0452075 , 1e-4); + EXPECT_NEAR(rdm[ 5 ], 0.06079552 , 1e-4); + EXPECT_NEAR(rdm[ 6 ], 0.5656554 , 1e-4); + EXPECT_NEAR(rdm[ 7 ], 0.0 , 1e-4); + ss.ReduceDensityMatrix(mps, scratch, 1, rdm); + EXPECT_NEAR(rdm[ 0 ], 0.39097905 , 1e-4); + EXPECT_NEAR(rdm[ 1 ], 0.0 , 1e-4); + EXPECT_NEAR(rdm[ 2 ], 0.02212441 , 1e-4); + EXPECT_NEAR(rdm[ 3 ], -0.026811063 , 1e-4); + EXPECT_NEAR(rdm[ 4 ], 0.02212441 , 1e-4); + EXPECT_NEAR(rdm[ 5 ], 0.026811063 , 1e-4); + EXPECT_NEAR(rdm[ 6 ], 0.6090213 , 1e-4); + EXPECT_NEAR(rdm[ 7 ], 0.0 , 1e-4); + ss.ReduceDensityMatrix(mps, scratch, 2, rdm); + EXPECT_NEAR(rdm[ 0 ], 0.49911368 , 1e-4); + EXPECT_NEAR(rdm[ 1 ], 0.0 , 1e-4); + EXPECT_NEAR(rdm[ 2 ], -0.1224697 , 1e-4); + EXPECT_NEAR(rdm[ 3 ], 0.030501436 , 1e-4); + EXPECT_NEAR(rdm[ 4 ], -0.1224697 , 1e-4); + EXPECT_NEAR(rdm[ 5 ], -0.030501436 , 1e-4); + EXPECT_NEAR(rdm[ 6 ], 0.5008867 , 1e-4); + EXPECT_NEAR(rdm[ 7 ], 0.0 , 1e-4); + ss.ReduceDensityMatrix(mps, scratch, 3, rdm); + EXPECT_NEAR(rdm[ 0 ], 0.5358647 , 1e-4); + EXPECT_NEAR(rdm[ 1 ], 0.0 , 1e-4); + EXPECT_NEAR(rdm[ 2 ], 0.11097979 , 1e-4); + EXPECT_NEAR(rdm[ 3 ], 0.08869759 , 1e-4); + EXPECT_NEAR(rdm[ 4 ], 0.11097979 , 1e-4); + EXPECT_NEAR(rdm[ 5 ], -0.08869759 , 1e-4); + EXPECT_NEAR(rdm[ 6 ], 0.4641356 , 1e-4); + EXPECT_NEAR(rdm[ 7 ], 0.0 , 1e-4); + ss.ReduceDensityMatrix(mps, scratch, 4, rdm); + EXPECT_NEAR(rdm[ 0 ], 0.47228184 , 1e-4); + EXPECT_NEAR(rdm[ 1 ], 0.0 , 1e-4); + EXPECT_NEAR(rdm[ 2 ], -0.105126746 , 1e-4); + EXPECT_NEAR(rdm[ 3 ], -0.082148254 , 1e-4); + EXPECT_NEAR(rdm[ 4 ], -0.105126746 , 1e-4); + EXPECT_NEAR(rdm[ 5 ], 0.082148254 , 1e-4); + EXPECT_NEAR(rdm[ 6 ], 0.5277185 , 1e-4); + EXPECT_NEAR(rdm[ 7 
], 0.0 , 1e-4); + +} + +TEST(MPSStateSpaceTest, SampleOnceSimple){ + auto ss = MPSStateSpace<For, float>(1); + auto mps = ss.Create(3, 4); + auto scratch = ss.Create(3, 4); + auto scratch2 = ss.Create(3, 4); + std::mt19937 rand_source(1234); + std::vector<bool> results; + + // Set to |100>. + results.clear(); + ss.SetStateZero(mps); + mps.get()[0] = 0; + mps.get()[8] = 1; + ss.SampleOnce(mps, scratch, scratch2, &rand_source, &results); + EXPECT_EQ(results[0], 1); + EXPECT_EQ(results[1], 0); + EXPECT_EQ(results[2], 0); + + // Set to |010>. + results.clear(); + ss.SetStateZero(mps); + mps.get()[16] = 0; + mps.get()[24] = 1; + ss.SampleOnce(mps, scratch, scratch2, &rand_source, &results); + EXPECT_EQ(results[0], 0); + EXPECT_EQ(results[1], 1); + EXPECT_EQ(results[2], 0); + + // Set to |001>. + results.clear(); + ss.SetStateZero(mps); + mps.get()[80] = 0; + mps.get()[82] = 1; + ss.SampleOnce(mps, scratch, scratch2, &rand_source, &results); + EXPECT_EQ(results[0], 0); + EXPECT_EQ(results[1], 0); + EXPECT_EQ(results[2], 1); + + // Set to |101>. 
+ results.clear(); + ss.SetStateZero(mps); + mps.get()[0] = 0; + mps.get()[8] = 1; + mps.get()[80] = 0; + mps.get()[82] = 1; + ss.SampleOnce(mps, scratch, scratch2, &rand_source, &results); + EXPECT_EQ(results[0], 1); + EXPECT_EQ(results[1], 0); + EXPECT_EQ(results[2], 1); +} + +TEST(MPSStateSpaceTest, SampleGHZ){ + const int num_samples = 10000; + auto ss = MPSStateSpace(1); + auto mps = ss.Create(3, 4); + auto scratch = ss.Create(3, 4); + auto scratch2 = ss.Create(3, 4); + std::vector> results( + num_samples, std::vector({})); + + memset(mps.get(), 0, ss.RawSize(mps)); + mps.get()[0] = 1; + mps.get()[10] = 1; + mps.get()[16] = 1; + mps.get()[42] = -1; + mps.get()[80] = 0.70710677; + mps.get()[86] = -0.70710677; + + float count = 0; + ss.Sample(mps, scratch, scratch2, num_samples, 1234, &results); + for(int i = 0 ; i < num_samples; i++){ + bool all_same = 1; + all_same &= results[i][0] == results[i][1]; + all_same &= results[i][1] == results[i][2]; + EXPECT_EQ(all_same, 1); + count += results[i][0]; + EXPECT_EQ(results[i].size(), 3); + } + EXPECT_NEAR(count / float(num_samples), 0.5, 1e-2); +} + +TEST(MPSStateSpaceTest, SampleComplex){ + const int num_samples = 10000; + auto ss = MPSStateSpace(1); + auto mps = ss.Create(4, 4); + auto scratch = ss.Create(4, 4); + auto scratch2 = ss.Create(4, 4); + std::vector> results( + num_samples, std::vector({})); + + memset(mps.get(), 0, ss.RawSize(mps)); + mps.get()[ 0 ] = -0.4917038696869799 ; + mps.get()[ 1 ] = 0.016731957658280873 ; + mps.get()[ 2 ] = 0.86132663373237 ; + mps.get()[ 3 ] = 0.12674293823327035 ; + mps.get()[ 8 ] = -0.5023020703950029 ; + mps.get()[ 9 ] = -0.711083648814302 ; + mps.get()[ 10 ] = -0.20727818303023368 ; + mps.get()[ 11 ] = -0.4461932766843352 ; + mps.get()[ 16 ] = 0.15655121570640956 ; + mps.get()[ 17 ] = 0.4732738079187066 ; + mps.get()[ 18 ] = -0.08511634068671248 ; + mps.get()[ 19 ] = 0.4509108800471812 ; + mps.get()[ 20 ] = 0.3399824326377983 ; + mps.get()[ 21 ] = 0.26456637633430585 ; + 
mps.get()[ 22 ] = 0.5923848721836553 ; + mps.get()[ 23 ] = -0.06659540240231236 ; + mps.get()[ 24 ] = 0.3386920440520109 ; + mps.get()[ 25 ] = -0.5078386788732782 ; + mps.get()[ 26 ] = -0.5938438138167242 ; + mps.get()[ 27 ] = -0.2253530600030204 ; + mps.get()[ 28 ] = -0.08439705180650249 ; + mps.get()[ 29 ] = 0.18289872169116567 ; + mps.get()[ 30 ] = 0.33989833066754255 ; + mps.get()[ 31 ] = -0.2604753706869852 ; + mps.get()[ 32 ] = 0.3013840839514031 ; + mps.get()[ 33 ] = -0.10757629710841352 ; + mps.get()[ 34 ] = -0.043855659850960294 ; + mps.get()[ 35 ] = -0.0999497956398576 ; + mps.get()[ 36 ] = 0.6336147397284169 ; + mps.get()[ 37 ] = 0.43658807519265264 ; + mps.get()[ 38 ] = -0.448346536528476 ; + mps.get()[ 39 ] = 0.30428652791930944 ; + mps.get()[ 40 ] = 0.2954131683108271 ; + mps.get()[ 41 ] = -0.4349910681437736 ; + mps.get()[ 42 ] = 0.35640542464599323 ; + mps.get()[ 43 ] = 0.4970533197510696 ; + mps.get()[ 44 ] = -0.37101487814696105 ; + mps.get()[ 45 ] = 0.2100308254832807 ; + mps.get()[ 46 ] = 0.10591704897593116 ; + mps.get()[ 47 ] = 0.3955295090226334 ; + mps.get()[ 80 ] = -0.24953341864058454 ; + mps.get()[ 81 ] = 0.0 ; + mps.get()[ 82 ] = -0.5480093086703182 ; + mps.get()[ 83 ] = -0.20497358945530025 ; + mps.get()[ 84 ] = -1.1887516198406813e-16 ; + mps.get()[ 85 ] = 3.714848812002129e-18 ; + mps.get()[ 88 ] = 0.6045663379213811 ; + mps.get()[ 89 ] = -0.3501271865840065 ; + mps.get()[ 90 ] = -0.29968140886676936 ; + mps.get()[ 91 ] = 0.40493683779718603 ; + mps.get()[ 96 ] = 0.3073334814703704 ; + mps.get()[ 97 ] = 0.0 ; + mps.get()[ 98 ] = 0.07297353820052123 ; + mps.get()[ 99 ] = -0.2859132301813451 ; + mps.get()[ 100 ] = -1.7214471606144266e-16 ; + mps.get()[ 101 ] = 5.379522376920083e-18 ; + mps.get()[ 104 ] = -0.18689238699414557 ; + mps.get()[ 105 ] = -0.4911602105890581 ; + mps.get()[ 106 ] = -0.30326863844349566 ; + mps.get()[ 107 ] = -0.22667282775953723 ; + mps.get()[ 112 ] = -0.10881711525857803 ; + mps.get()[ 113 ] = 0.0 ; + 
mps.get()[ 114 ] = -0.146152770590198 ; + mps.get()[ 115 ] = 0.2149415742117364 ; + mps.get()[ 116 ] = -4.72314539505504e-16 ; + mps.get()[ 117 ] = 1.1519866817207415e-17 ; + mps.get()[ 120 ] = -0.01567698028444534 ; + mps.get()[ 121 ] = 0.013440646849502781 ; + mps.get()[ 122 ] = -0.17367051562799563 ; + mps.get()[ 123 ] = -0.24954843447516284 ; + mps.get()[ 128 ] = 0.24030153622040965 ; + mps.get()[ 129 ] = 0.0 ; + mps.get()[ 130 ] = -0.08309837568058188 ; + mps.get()[ 131 ] = 0.07924116582885271 ; + mps.get()[ 132 ] = -7.075275311738327e-17 ; + mps.get()[ 133 ] = 3.930708506521293e-18 ; + mps.get()[ 136 ] = 0.0725269370009367 ; + mps.get()[ 137 ] = 0.06123701427497634 ; + mps.get()[ 138 ] = -0.006630682493419155 ; + mps.get()[ 139 ] = 0.015491880670142021 ; + mps.get()[ 144 ] = -0.021403127627426542 ; + mps.get()[ 145 ] = 0.04422341855596844 ; + mps.get()[ 146 ] = 0.27602112861704176 ; + mps.get()[ 147 ] = 0.7790060986745896 ; + mps.get()[ 148 ] = 0.25252680029727903 ; + mps.get()[ 149 ] = 0.49967041792054084 ; + mps.get()[ 150 ] = -0.031679241045523554 ; + mps.get()[ 151 ] = -0.010202895067710558 ; + + ss.Sample(mps, scratch, scratch2, num_samples, 12345, &results); + std::vector expected({ + 0.036801, + 0.040697, + 0.002013, + 0.064595, + 0.014892, + 0.082028, + 0.008521, + 0.168310, + 0.022078, + 0.005907, + 0.024806, + 0.189074, + 0.090056, + 0.023125, + 0.116683, + 0.110406 + }); + std::vector hist(16, 0); + for(int i =0;i()); } +TEST(QTrajectoryAVXTest, ReusingResults) { + TestReusingResults(qsim::Factory()); +} + TEST(QTrajectoryAVXTest, CollectKopStat) { TestCollectKopStat(qsim::Factory()); } diff --git a/tests/qtrajectory_cuda_test.cu b/tests/qtrajectory_cuda_test.cu index 66c31acb..730ff7ed 100644 --- a/tests/qtrajectory_cuda_test.cu +++ b/tests/qtrajectory_cuda_test.cu @@ -58,6 +58,14 @@ TEST(QTrajectoryCUDATest, GenDump) { TestGenDump(factory); } +TEST(QTrajectoryCUDATest, ReusingResults) { + using Factory = qsim::Factory; + 
Factory::StateSpace::Parameter param1; + Factory::Simulator::Parameter param2; + Factory factory(param1, param2); + TestReusingResults(factory); +} + TEST(QTrajectoryCUDATest, CollectKopStat) { using Factory = qsim::Factory<float>; Factory::StateSpace::Parameter param1; diff --git a/tests/qtrajectory_custatevec_test.cu b/tests/qtrajectory_custatevec_test.cu new file mode 100644 index 00000000..ca7e823b --- /dev/null +++ b/tests/qtrajectory_custatevec_test.cu @@ -0,0 +1,87 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "qtrajectory_testfixture.h" + +#include <cublas_v2.h> +#include <custatevec.h> + +#include "gtest/gtest.h" + +#include "../lib/simulator_custatevec.h" + +namespace qsim { + +template <typename FP> +struct Factory { + using fp_type = FP; + using Simulator = qsim::SimulatorCuStateVec<fp_type>; + using StateSpace = typename Simulator::StateSpace; + + Factory() { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; +}; + +TEST(QTrajectoryCuStateVecTest, BitFlip) { + TestBitFlip(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, GenDump) { + TestGenDump(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, ReusingResults) { + TestReusingResults(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, CollectKopStat) { + TestCollectKopStat(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, CleanCircuit) { + TestCleanCircuit(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, InitialState) { + TestInitialState(qsim::Factory<float>()); +} + +TEST(QTrajectoryCuStateVecTest, UncomputeFinalState) { + TestUncomputeFinalState(qsim::Factory<float>()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/qtrajectory_testfixture.h b/tests/qtrajectory_testfixture.h index 80bd15a6..786d997c 100644 --- a/tests/qtrajectory_testfixture.h +++ b/tests/qtrajectory_testfixture.h @@ -21,8 +21,10 @@ #include "gtest/gtest.h" +#include "../lib/channel.h" #include "../lib/channels_cirq.h" +#include "../lib/circuit_noisy.h" +#include "../lib/expect.h" #include "../lib/fuser_mqubit.h" #include "../lib/gate_appl.h" #include 
"../lib/gates_cirq.h" @@ -104,6 +106,10 @@ void AddGenAmplDumpNoise1( {normal, 0, p2, {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}}, {normal, 0, p3, {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}}, {normal, 0, p3, {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } } template @@ -134,16 +140,165 @@ void AddGenAmplDumpNoise2( {normal, 0, p2, {M::Create(time, 0, {r2, 0, 0, 0, 0, 0, t2, 0})}}, {normal, 0, p3, {M::Create(time, 0, {0, 0, s1, 0, 0, 0, 0, 0})}}, {normal, 0, p3, {M::Create(time, 0, {0, 0, 0, 0, s2, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } + ncircuit.channels.push_back( {{normal, 0, p1, {M::Create(time, 1, {t1, 0, 0, 0, 0, 0, r1, 0})}}, {normal, 0, p2, {M::Create(time, 1, {r2, 0, 0, 0, 0, 0, t2, 0})}}, {normal, 0, p3, {M::Create(time, 1, {0, 0, s1, 0, 0, 0, 0, 0})}}, {normal, 0, p3, {M::Create(time, 1, {0, 0, 0, 0, s2, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } +} + +// Adds the same channel as in AddGenAmplDumpNoise2 above. +template +void AddGenAmplDumpNoise2Alt( + unsigned time, double g, NoisyCircuit& ncircuit) { + using fp_type = typename Gate::fp_type; + + // Probability of exchanging energy with the environment. 
+ double p = 0.5; + + double p1 = p * (1 - g); + double p2 = (1 - p) * (1 - g); + double p3 = 0; + + fp_type t1 = std::sqrt(p); + fp_type r1 = std::sqrt(p * (1 - g)); + fp_type s1 = std::sqrt(p * g); + fp_type t2 = std::sqrt(1 - p); + fp_type r2 = std::sqrt((1 - p) * (1 - g)); + fp_type s2 = std::sqrt((1 - p) * g); + + auto normal = KrausOperator::kNormal; + + using M = Cirq::MatrixGate1; + + ncircuit.channels.push_back( + {{normal, 0, p1 * p1, + {M::Create(time, 0, {t1, 0, 0, 0, 0, 0, r1, 0}), + M::Create(time, 1, {t1, 0, 0, 0, 0, 0, r1, 0})}}, + {normal, 0, p1 * p2, + {M::Create(time, 0, {t1, 0, 0, 0, 0, 0, r1, 0}), + M::Create(time, 1, {r2, 0, 0, 0, 0, 0, t2, 0})}}, + {normal, 0, p1 * p3, + {M::Create(time, 0, {t1, 0, 0, 0, 0, 0, r1, 0}), + M::Create(time, 1, {0, 0, s1, 0, 0, 0, 0, 0})}}, + {normal, 0, p1 * p3, + {M::Create(time, 0, {t1, 0, 0, 0, 0, 0, r1, 0}), + M::Create(time, 1, {0, 0, 0, 0, s2, 0, 0, 0})}}, + + {normal, 0, p2 * p1, + {M::Create(time, 0, {r2, 0, 0, 0, 0, 0, t2, 0}), + M::Create(time, 1, {t1, 0, 0, 0, 0, 0, r1, 0})}}, + {normal, 0, p2 * p2, + {M::Create(time, 0, {r2, 0, 0, 0, 0, 0, t2, 0}), + M::Create(time, 1, {r2, 0, 0, 0, 0, 0, t2, 0})}}, + {normal, 0, p2 * p3, + {M::Create(time, 0, {r2, 0, 0, 0, 0, 0, t2, 0}), + M::Create(time, 1, {0, 0, s1, 0, 0, 0, 0, 0})}}, + {normal, 0, p2 * p3, + {M::Create(time, 0, {r2, 0, 0, 0, 0, 0, t2, 0}), + M::Create(time, 1, {0, 0, 0, 0, s2, 0, 0, 0})}}, + + {normal, 0, p3 * p1, + {M::Create(time, 0, {0, 0, s1, 0, 0, 0, 0, 0}), + M::Create(time, 1, {t1, 0, 0, 0, 0, 0, r1, 0})}}, + {normal, 0, p3 * p2, + {M::Create(time, 0, {0, 0, s1, 0, 0, 0, 0, 0}), + M::Create(time, 1, {r2, 0, 0, 0, 0, 0, t2, 0})}}, + {normal, 0, p3 * p3, + {M::Create(time, 0, {0, 0, s1, 0, 0, 0, 0, 0}), + M::Create(time, 1, {0, 0, s1, 0, 0, 0, 0, 0})}}, + {normal, 0, p3 * p3, + {M::Create(time, 0, {0, 0, s1, 0, 0, 0, 0, 0}), + M::Create(time, 1, {0, 0, 0, 0, s2, 0, 0, 0})}}, + + {normal, 0, p3 * p1, + {M::Create(time, 0, {0, 0, 0, 0, s2, 0, 
0, 0}), + M::Create(time, 1, {t1, 0, 0, 0, 0, 0, r1, 0})}}, + {normal, 0, p3 * p2, + {M::Create(time, 0, {0, 0, 0, 0, s2, 0, 0, 0}), + M::Create(time, 1, {r2, 0, 0, 0, 0, 0, t2, 0})}}, + {normal, 0, p3 * p3, + {M::Create(time, 0, {0, 0, 0, 0, s2, 0, 0, 0}), + M::Create(time, 1, {0, 0, s1, 0, 0, 0, 0, 0})}}, + {normal, 0, p3 * p3, + {M::Create(time, 0, {0, 0, 0, 0, s2, 0, 0, 0}), + M::Create(time, 1, {0, 0, 0, 0, s2, 0, 0, 0})}}, + }); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } +} + +template +void AddAmplDumpNoise1( + unsigned time, unsigned q, double g, NoisyCircuit& ncircuit) { + using fp_type = typename Gate::fp_type; + + double p1 = 1 - g; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(g); + + auto normal = KrausOperator::kNormal; + + using M = Cirq::MatrixGate1; + + ncircuit.channels.push_back( + {{normal, 0, p1, {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}}, + {normal, 0, p2, {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } +} + +template +void AddAmplDumpNoise2( + unsigned time, double g, NoisyCircuit& ncircuit) { + using fp_type = typename Gate::fp_type; + + double p1 = 1 - g; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(g); + + auto normal = KrausOperator::kNormal; + + using M = Cirq::MatrixGate1; + + ncircuit.channels.push_back( + {{normal, 0, p1, {M::Create(time, 0, {1, 0, 0, 0, 0, 0, r, 0})}}, + {normal, 0, p2, {M::Create(time, 0, {0, 0, s, 0, 0, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } + + ncircuit.channels.push_back( + {{normal, 0, p1, {M::Create(time, 1, {1, 0, 0, 0, 0, 0, r, 0})}}, + {normal, 0, p2, {M::Create(time, 1, {0, 0, s, 0, 0, 0, 0, 0})}}}); + + for (auto& kop : ncircuit.channels.back()) { + kop.CalculateKdKMatrix(); + } } template NoisyCircuit GenerateNoisyCircuit( - double p, AddNoise1&& add_noise1, AddNoise2&& 
add_noise2) { + double p, AddNoise1&& add_noise1, AddNoise2&& add_noise2, + bool add_measurement = true) { using fp_type = typename Gate::fp_type; NoisyCircuit ncircuit; @@ -176,9 +331,12 @@ NoisyCircuit GenerateNoisyCircuit( add_noise1(9, 1, p, ncircuit); ncircuit.channels.push_back({{normal, 1, 1.0, {IS::Create(10, 0, 1)}}}); add_noise2(11, p, ncircuit); - ncircuit.channels.push_back({{KrausOperator::kMeasurement, 1, 1.0, - {gate::Measurement::Create(12, {0, 1})}}}); - add_noise2(13, p, ncircuit); + if (add_measurement) { + ncircuit.channels.push_back( + {{KrausOperator::kMeasurement, 1, 1.0, + {gate::Measurement::Create(12, {0, 1})}}}); + add_noise2(13, p, ncircuit); + } return ncircuit; } @@ -196,10 +354,10 @@ void RunBatch(const Factory& factory, const NoisyCircuit& ncircuit, unsigned num_reps = 25000; auto measure = [](uint64_t r, const State& state, - const std::vector& stat, + const typename QTSimulator::Stat& stat, std::vector& histogram) { - ASSERT_EQ(stat.size(), 1); - ++histogram[stat[0]]; + ASSERT_EQ(stat.samples.size(), 1); + ++histogram[stat.samples[0]]; }; std::vector histogram(1 << num_qubits, 0); @@ -234,13 +392,10 @@ void RunOnceRepeatedly(const Factory& factory, Simulator simulator = factory.CreateSimulator(); StateSpace state_space = factory.CreateStateSpace(); - State scratch = state_space.Null(); State state = state_space.Create(num_qubits); EXPECT_FALSE(state_space.IsNull(state)); - auto state_pointer = state.get(); - - std::vector stat; + typename QTSimulator::Stat stat; std::vector histogram(1 << num_qubits, 0); @@ -251,12 +406,10 @@ void RunOnceRepeatedly(const Factory& factory, state_space.SetStateZero(state); EXPECT_TRUE(QTSimulator::RunOnce( - param, ncircuit, i, state_space, simulator, scratch, state, stat)); - - EXPECT_EQ(state_pointer, state.get()); + param, ncircuit, i, state_space, simulator, state, stat)); - ASSERT_EQ(stat.size(), 1); - ++histogram[stat[0]]; + ASSERT_EQ(stat.samples.size(), 1); + ++histogram[stat.samples[0]]; } 
for (std::size_t i = 0; i < histogram.size(); ++i) { @@ -264,6 +417,181 @@ void RunOnceRepeatedly(const Factory& factory, } } +template +std::vector> ExpValsRunBatch( + const Factory& factory, const NoisyCircuit& ncircuit, + bool reuse_results) { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Factory::StateSpace; + using State = typename StateSpace::State; + using Fuser = MultiQubitGateFuser; + using QTSimulator = QuantumTrajectorySimulator; + + unsigned num_qubits = 2; + unsigned num_reps = 25000; + + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); + + State state = state_space.Create(num_qubits); + EXPECT_FALSE(state_space.IsNull(state)); + + typename QTSimulator::Stat stat; + + typename QTSimulator::Parameter param; + param.apply_last_deferred_ops = !reuse_results; + + using Observables = std::vector>>; + Observables observables; + observables.reserve(num_qubits); + + using rx = qsim::Cirq::rx; + + for (unsigned q = 0; q < num_qubits; ++q) { + observables.push_back({{{1.0, 0.0}, {rx::Create(0, q, 1.7 + 0.6 * q)}}}); + } + + using TrajResults = std::vector>>; + TrajResults traj_results(observables.size()); + + for (std::size_t k = 0; k < observables.size(); ++k) { + traj_results[k].reserve(num_reps); + } + + std::vector> primary_results; + primary_results.reserve(observables.size()); + + auto measure = [](uint64_t r, const State& state, + const typename QTSimulator::Stat& stat, + const Simulator& simulator, bool reuse_results, + const Observables& observables, + std::vector>& primary_results, + TrajResults& traj_results) { + if (reuse_results && stat.primary && !primary_results.empty()) { + for (std::size_t k = 0; k < observables.size(); ++k) { + traj_results[k].push_back(primary_results[k]); + } + } else { + for (std::size_t k = 0; k < observables.size(); ++k) { + const auto& obs = observables[k]; + auto result = ExpectationValue(obs, simulator, state); + 
traj_results[k].push_back(result); + + if (reuse_results && stat.primary) { + primary_results.push_back(result); + } + } + } + }; + + EXPECT_TRUE(QTSimulator::RunBatch(param, ncircuit, 0, num_reps, state_space, + simulator, measure, simulator, + reuse_results, observables, primary_results, + traj_results)); + + std::vector> results; + results.reserve(observables.size()); + + double inverse_num_reps = 1.0 / num_reps; + + for (std::size_t k = 0; k < observables.size(); ++k) { + std::complex sum = 0; + for (unsigned i = 0; i < num_reps; ++i) { + sum += traj_results[k][i]; + } + + results.push_back(inverse_num_reps * sum); + } + + return results; +} + +template +std::vector> ExpValsRunOnceRepeatedly( + const Factory& factory, const NoisyCircuit& ncircuit, + bool reuse_results) { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Factory::StateSpace; + using State = typename StateSpace::State; + using Fuser = MultiQubitGateFuser; + using QTSimulator = QuantumTrajectorySimulator; + + unsigned num_qubits = 2; + unsigned num_reps = 25000; + + Simulator simulator = factory.CreateSimulator(); + StateSpace state_space = factory.CreateStateSpace(); + + State state = state_space.Create(num_qubits); + EXPECT_FALSE(state_space.IsNull(state)); + + typename QTSimulator::Stat stat; + + typename QTSimulator::Parameter param; + param.apply_last_deferred_ops = true; + + std::vector>> observables; + observables.reserve(num_qubits); + + using rx = qsim::Cirq::rx; + + for (unsigned q = 0; q < num_qubits; ++q) { + observables.push_back({{{1.0, 0.0}, {rx::Create(0, q, 1.7 + 0.6 * q)}}}); + } + + using TrajResults = std::vector>>; + TrajResults traj_results(observables.size()); + + for (std::size_t k = 0; k < observables.size(); ++k) { + traj_results[k].reserve(num_reps); + } + + std::vector> primary_results; + primary_results.reserve(observables.size()); + + for (unsigned i = 0; i < num_reps; ++i) { + state_space.SetStateZero(state); + + 
EXPECT_TRUE(QTSimulator::RunOnce( + param, ncircuit, i, state_space, simulator, state, stat)); + + if (reuse_results && stat.primary && !primary_results.empty()) { + for (std::size_t k = 0; k < observables.size(); ++k) { + traj_results[k].push_back(primary_results[k]); + } + } else { + for (std::size_t k = 0; k < observables.size(); ++k) { + const auto& obs = observables[k]; + auto result = ExpectationValue(obs, simulator, state); + traj_results[k].push_back(result); + + if (reuse_results && stat.primary) { + primary_results.push_back(result); + param.apply_last_deferred_ops = false; + } + } + } + } + + std::vector> results; + results.reserve(observables.size()); + + double inverse_num_reps = 1.0 / num_reps; + + for (std::size_t k = 0; k < observables.size(); ++k) { + std::complex sum = 0; + for (unsigned i = 0; i < num_reps; ++i) { + sum += traj_results[k][i]; + } + + results.push_back(inverse_num_reps * sum); + } + + return results; +} + template void TestBitFlip(const Factory& factory) { /* The expected results are obtained with the following Cirq code. 
@@ -361,9 +689,39 @@ for key, val in sorted(res.histogram(key='m').items()): using Gate = Cirq::GateCirq; - auto ncircuit = GenerateNoisyCircuit(0.1, AddGenAmplDumpNoise1, - AddGenAmplDumpNoise2); - RunOnceRepeatedly(factory, ncircuit, expected_results); + { + auto ncircuit = GenerateNoisyCircuit(0.1, AddGenAmplDumpNoise1, + AddGenAmplDumpNoise2); + RunOnceRepeatedly(factory, ncircuit, expected_results); + } + + { + auto ncircuit = GenerateNoisyCircuit(0.1, AddGenAmplDumpNoise1, + AddGenAmplDumpNoise2Alt); + RunOnceRepeatedly(factory, ncircuit, expected_results); + } +} + +template +void TestReusingResults(const Factory& factory) { + using Gate = Cirq::GateCirq; + + auto ncircuit = GenerateNoisyCircuit(0.02, AddAmplDumpNoise1, + AddAmplDumpNoise2, false); + + auto results1 = ExpValsRunOnceRepeatedly(factory, ncircuit, false); + auto results2 = ExpValsRunOnceRepeatedly(factory, ncircuit, true); + auto results3 = ExpValsRunBatch(factory, ncircuit, false); + auto results4 = ExpValsRunBatch(factory, ncircuit, true); + + for (std::size_t k = 0; k < results1.size(); ++k) { + EXPECT_NEAR(std::real(results1[k]), std::real(results2[k]), 1e-8); + EXPECT_NEAR(std::imag(results1[k]), std::imag(results2[k]), 1e-8); + EXPECT_NEAR(std::real(results1[k]), std::real(results3[k]), 1e-8); + EXPECT_NEAR(std::imag(results1[k]), std::imag(results3[k]), 1e-8); + EXPECT_NEAR(std::real(results1[k]), std::real(results4[k]), 1e-8); + EXPECT_NEAR(std::imag(results1[k]), std::imag(results4[k]), 1e-8); + } } template @@ -411,11 +769,11 @@ void TestCollectKopStat(const Factory& factory) { {normal, 1, p2, {X::Create(1, 3)}}}); auto measure = [](uint64_t r, const State& state, - const std::vector& stat, + const typename QTSimulator::Stat& stat, std::vector>& histogram) { - ASSERT_EQ(stat.size(), histogram.size()); + ASSERT_EQ(stat.samples.size(), histogram.size()); for (std::size_t i = 0; i < histogram.size(); ++i) { - ++histogram[i][stat[i]]; + ++histogram[i][stat.samples[i]]; } }; @@ -508,12 
+866,11 @@ void TestCleanCircuit(const Factory& factory) { ApplyGate(simulator, gate, state); } - State scratch = state_space.Null(); State nstate = state_space.Create(num_qubits); EXPECT_FALSE(state_space.IsNull(nstate)); - std::vector stat; + typename QTSimulator::Stat stat; typename QTSimulator::Parameter param; @@ -522,9 +879,9 @@ void TestCleanCircuit(const Factory& factory) { // Run quantum trajectory simulator. EXPECT_TRUE(QTSimulator::RunOnce(param, num_qubits, ncircuit.channels.begin(), ncircuit.channels.end(), 0, state_space, - simulator, scratch, nstate, stat)); + simulator, nstate, stat)); - EXPECT_EQ(stat.size(), 0); + EXPECT_EQ(stat.samples.size(), 0); for (uint64_t i = 0; i < size; ++i) { auto a1 = state_space.GetAmpl(state, i); @@ -565,20 +922,19 @@ void TestInitialState(const Factory& factory) { Simulator simulator = factory.CreateSimulator(); StateSpace state_space = factory.CreateStateSpace(); - State scratch = state_space.Null(); State state = state_space.Create(num_qubits); EXPECT_FALSE(state_space.IsNull(state)); typename QTSimulator::Parameter param; - std::vector stat; + typename QTSimulator::Stat stat; for (unsigned i = 0; i < 8; ++i) { state_space.SetAmpl(state, i, 1 + i, 0); } EXPECT_TRUE(QTSimulator::RunOnce( - param, ncircuit, 0, state_space, simulator, scratch, state, stat)); + param, ncircuit, 0, state_space, simulator, state, stat)); // Expect reversed order of amplitudes. for (unsigned i = 0; i < 8; ++i) { @@ -625,7 +981,6 @@ void TestUncomputeFinalState(const Factory& factory) { Simulator simulator = factory.CreateSimulator(); StateSpace state_space = factory.CreateStateSpace(); - State scratch = state_space.Null(); State state = state_space.Create(num_qubits); EXPECT_FALSE(state_space.IsNull(state)); @@ -635,19 +990,19 @@ void TestUncomputeFinalState(const Factory& factory) { typename QTSimulator::Parameter param; param.collect_kop_stat = true; - std::vector stat; + typename QTSimulator::Stat stat; // Run one trajectory. 
- EXPECT_TRUE(QTSimulator::RunOnce(param, ncircuit, 0, state_space, simulator, - scratch, state, stat)); + EXPECT_TRUE(QTSimulator::RunOnce( + param, ncircuit, 0, state_space, simulator, state, stat)); - EXPECT_EQ(ncircuit.channels.size(), stat.size()); + EXPECT_EQ(ncircuit.channels.size(), stat.samples.size()); // Uncompute the final state back to |0000> (up to round-off errors). for (std::size_t i = 0; i < ncircuit.channels.size(); ++i) { auto k = ncircuit.channels.size() - 1 - i; - const auto& ops = ncircuit.channels[k][stat[k]].ops; + const auto& ops = ncircuit.channels[k][stat.samples[k]].ops; for (auto it = ops.rbegin(); it != ops.rend(); ++it) { ApplyGateDagger(simulator, *it, state); diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc index 9265038a..fbc9510e 100644 --- a/tests/simulator_avx512_test.cc +++ b/tests/simulator_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#if defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#if defined(__AVX512F__) && !defined(_WIN32) #ifdef _OPENMP #include "../lib/parfor.h" @@ -94,7 +94,7 @@ TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) { } // namespace qsim -#endif // defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#endif // defined(__AVX512F__) && !defined(_WIN32) int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tests/simulator_custatevec_test.cu b/tests/simulator_custatevec_test.cu new file mode 100644 index 00000000..646e0c07 --- /dev/null +++ b/tests/simulator_custatevec_test.cu @@ -0,0 +1,108 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "simulator_testfixture.h" + +#include +#include + +#include + +#include "gtest/gtest.h" + +#include "../lib/simulator_custatevec.h" + +namespace qsim { + +template +class SimulatorCuStateVecTest : public testing::Test {}; + +using fp_impl = ::testing::Types; + +TYPED_TEST_SUITE(SimulatorCuStateVecTest, fp_impl); + +template +struct Factory { + using Simulator = qsim::SimulatorCuStateVec; + using StateSpace = typename Simulator::StateSpace; + + Factory() { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; +}; + +TYPED_TEST(SimulatorCuStateVecTest, ApplyGate1) { + TestApplyGate1(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, ApplyGate2) { + TestApplyGate2(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, ApplyGate3) { + TestApplyGate3(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, ApplyGate5) { + TestApplyGate5(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, CircuitWithControlledGates) { + TestCircuitWithControlledGates(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, CircuitWithControlledGatesDagger) { + TestCircuitWithControlledGatesDagger(qsim::Factory()); +} 
+ +TYPED_TEST(SimulatorCuStateVecTest, MultiQubitGates) { + TestMultiQubitGates(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, ControlledGates) { + bool high_precision = std::is_same::value; + TestControlledGates(qsim::Factory(), high_precision); +} + +TYPED_TEST(SimulatorCuStateVecTest, ExpectationValue1) { + TestExpectationValue1(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecTest, ExpectationValue2) { + TestExpectationValue2(qsim::Factory()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/simulator_testfixture.h b/tests/simulator_testfixture.h index ba575f4a..ef335565 100644 --- a/tests/simulator_testfixture.h +++ b/tests/simulator_testfixture.h @@ -27,6 +27,7 @@ #include "../lib/gate_appl.h" #include "../lib/gates_qsim.h" #include "../lib/io.h" +#include "../lib/util_cpu.h" namespace qsim { @@ -223,7 +224,8 @@ void TestApplyGate5(const Factory& factory) { auto gate25 = GateRX::Create(11, 4, 0.3); GateFused> fgate1{kGateCZ, 2, {0, 1}, &gate11, - {&gate1, &gate2, &gate6, &gate7, &gate11, &gate12, &gate13}}; + {&gate1, &gate2, &gate6, &gate7, &gate11, &gate12, &gate13}, {}}; + CalculateFusedMatrix(fgate1); ApplyFusedGate(simulator, fgate1, state); EXPECT_NEAR(state_space.Norm(state), 1, 1e-6); @@ -244,7 +246,8 @@ void TestApplyGate5(const Factory& factory) { } GateFused> fgate2{kGateIS, 4, {1, 2}, &gate14, - {&gate3, &gate8, &gate14, &gate15, &gate16}}; + {&gate3, &gate8, &gate14, &gate15, &gate16}, {}}; + CalculateFusedMatrix(fgate2); ApplyFusedGate(simulator, fgate2, state); EXPECT_NEAR(state_space.Norm(state), 1, 1e-6); @@ -265,7 +268,8 @@ void TestApplyGate5(const Factory& factory) { } GateFused> fgate3{kGateCNot, 6, {2, 3}, &gate17, - {&gate4, &gate9, &gate17, &gate18, &gate19}}; + {&gate4, &gate9, &gate17, &gate18, &gate19},{}}; + CalculateFusedMatrix(fgate3); ApplyFusedGate(simulator, fgate3, state); 
EXPECT_NEAR(state_space.Norm(state), 1, 1e-6); @@ -286,7 +290,8 @@ void TestApplyGate5(const Factory& factory) { } GateFused> fgate4{kGateFS, 8, {3, 4}, &gate20, - {&gate5, &gate10, &gate20, &gate21, &gate22}}; + {&gate5, &gate10, &gate20, &gate21, &gate22}, {}}; + CalculateFusedMatrix(fgate4); ApplyFusedGate(simulator, fgate4, state); EXPECT_NEAR(state_space.Norm(state), 1, 1e-6); @@ -306,7 +311,9 @@ void TestApplyGate5(const Factory& factory) { EXPECT_NEAR(std::imag(ampl3), -0.00987822, 1e-6); } - GateFused> fgate5{kGateCP, 10, {0, 1}, &gate23, {&gate23}}; + GateFused> fgate5{kGateCP, 10, {0, 1}, &gate23, + {&gate23}, {}}; + CalculateFusedMatrix(fgate5); ApplyFusedGate(simulator, fgate5, state); EXPECT_NEAR(state_space.Norm(state), 1, 1e-6); @@ -460,11 +467,11 @@ void TestCircuitWithControlledGates(const Factory& factory) { StateSpace state_space = factory.CreateStateSpace(); Simulator simulator = factory.CreateSimulator(); - auto state = state_space.Create(num_qubits); - state_space.SetStateZero(state); + auto state1 = state_space.Create(num_qubits); + state_space.SetStateZero(state1); for (const auto& gate : gates) { - ApplyGate(simulator, gate, state); + ApplyGate(simulator, gate, state1); } /* @@ -717,10 +724,42 @@ if __name__ == '__main__': unsigned size = 1 << num_qubits; for (unsigned i = 0; i < size; ++i) { - auto a = StateSpace::GetAmpl(state, i); + auto a = StateSpace::GetAmpl(state1, i); EXPECT_NEAR(std::real(a), expected_results[i][0], 1e-6); EXPECT_NEAR(std::imag(a), expected_results[i][1], 1e-6); } + + SetFlushToZeroAndDenormalsAreZeros(); + + auto state2 = state_space.Create(num_qubits); + state_space.SetStateZero(state2); + + for (const auto& gate : gates) { + ApplyGate(simulator, gate, state2); + } + + for (unsigned i = 0; i < size; ++i) { + auto a1 = StateSpace::GetAmpl(state1, i); + auto a2 = StateSpace::GetAmpl(state2, i); + EXPECT_EQ(std::real(a1), std::real(a2)); + EXPECT_EQ(std::imag(a1), std::imag(a2)); + } + + 
ClearFlushToZeroAndDenormalsAreZeros(); + + auto state3 = state_space.Create(num_qubits); + state_space.SetStateZero(state3); + + for (const auto& gate : gates) { + ApplyGate(simulator, gate, state3); + } + + for (unsigned i = 0; i < size; ++i) { + auto a1 = StateSpace::GetAmpl(state1, i); + auto a2 = StateSpace::GetAmpl(state3, i); + EXPECT_EQ(std::real(a1), std::real(a2)); + EXPECT_EQ(std::imag(a1), std::imag(a2)); + } } template @@ -1191,13 +1230,10 @@ void TestControlledGates(const Factory& factory, bool high_precision) { std::vector matrix; matrix.reserve(1 << (2 * max_target_qubits + 1)); - std::vector vec; - vec.reserve(state_space.MinSize(max_qubits)); + std::vector vec(state_space.MinSize(max_qubits)); // Iterate over circuit size. for (unsigned num_qubits = 2; num_qubits <= max_qubits; ++num_qubits) { - vec.resize(state_space.MinSize(num_qubits)); - unsigned size = 1 << num_qubits; unsigned nmask = size - 1; diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc index d018eb79..6bbb1f58 100644 --- a/tests/statespace_avx512_test.cc +++ b/tests/statespace_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#if defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#if defined(__AVX512F__) && !defined(_WIN32) #ifdef _OPENMP #include "../lib/parfor.h" @@ -115,7 +115,7 @@ TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) { } // namespace qsim -#endif // defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#endif // defined(__AVX512F__) && !defined(_WIN32) int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tests/statespace_custatevec_test.cu b/tests/statespace_custatevec_test.cu new file mode 100644 index 00000000..841be8ee --- /dev/null +++ b/tests/statespace_custatevec_test.cu @@ -0,0 +1,126 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "statespace_testfixture.h" + +#include +#include + +#include "gtest/gtest.h" + +#include "../lib/simulator_custatevec.h" +#include "../lib/statespace_custatevec.h" + +namespace qsim { + +template +class StateSpaceCuStateVecTest : public testing::Test {}; + +using fp_impl = ::testing::Types; + +TYPED_TEST_SUITE(StateSpaceCuStateVecTest, fp_impl); + +template +struct Factory { + using Simulator = qsim::SimulatorCuStateVec; + using StateSpace = typename Simulator::StateSpace; + + Factory() { + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(custatevecCreate(&custatevec_handle)); + } + + ~Factory() { + ErrorCheck(cublasDestroy(cublas_handle)); + ErrorCheck(custatevecDestroy(custatevec_handle)); + } + + StateSpace CreateStateSpace() const { + return StateSpace(cublas_handle, custatevec_handle); + } + + Simulator CreateSimulator() const { + return Simulator(custatevec_handle); + } + + cublasHandle_t cublas_handle; + custatevecHandle_t custatevec_handle; +}; + +TYPED_TEST(StateSpaceCuStateVecTest, Add) { + TestAdd(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, NormSmall) { + TestNormSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, NormAndInnerProductSmall) { + TestNormAndInnerProductSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, NormAndInnerProduct) { + TestNormAndInnerProduct(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, 
SamplingSmall) { + TestSamplingSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, SamplingCrossEntropyDifference) { + TestSamplingCrossEntropyDifference(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, Ordering) { + TestOrdering(qsim::Factory()); +} + +TEST(StateSpaceCuStateVecTest, MeasurementSmall) { + TestMeasurementSmall(qsim::Factory(), true); +} + +TYPED_TEST(StateSpaceCuStateVecTest, MeasurementLarge) { +// This test fails. +// TestMeasurementLarge(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, Collapse) { + TestCollapse(qsim::Factory()); +} + +TEST(StateSpaceCuStateVecTest, InvalidStateSize) { + TestInvalidStateSize(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, BulkSetAmpl) { +// Not implemented. +// TestBulkSetAmplitude(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, BulkSetAmplExclusion) { +// Not implemented. +// TestBulkSetAmplitudeExclusion(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecTest, BulkSetAmplDefault) { +// Not implemented. 
+// TestBulkSetAmplitudeDefault(factory); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/statespace_testfixture.h b/tests/statespace_testfixture.h index 2a41c047..a6df9a29 100644 --- a/tests/statespace_testfixture.h +++ b/tests/statespace_testfixture.h @@ -463,17 +463,18 @@ void TestNormAndInnerProductSmall(const Factory& factory) { template void TestNormAndInnerProduct(const Factory& factory) { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + using Runner = QSimRunner>, Factory>; + unsigned depth = 8; std::stringstream ss(circuit_string); - Circuit> circuit; + Circuit> circuit; EXPECT_TRUE(CircuitQsimParser::FromStream(depth, provider, ss, circuit)); - circuit.gates.emplace_back(GateT::Create(depth + 1, 0)); - - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using State = typename StateSpace::State; - using Runner = QSimRunner>, Factory>; + circuit.gates.emplace_back(GateT::Create(depth + 1, 0)); StateSpace state_space = factory.CreateStateSpace(); State state0 = state_space.Create(circuit.num_qubits); @@ -773,17 +774,18 @@ void TestMeasurementSmall(const Factory& factory, bool cuda = false) { template void TestMeasurementLarge(const Factory& factory) { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + using Runner = QSimRunner>, Factory>; + unsigned depth = 20; std::stringstream ss(circuit_string); - Circuit> circuit; + Circuit> circuit; EXPECT_TRUE(CircuitQsimParser::FromStream(depth, provider, ss, circuit)); - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using 
State = typename StateSpace::State; - using Runner = QSimRunner>, Factory>; - StateSpace state_space = factory.CreateStateSpace(); State state = state_space.Create(circuit.num_qubits); diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc index e7cad957..56982688 100644 --- a/tests/unitary_calculator_avx512_test.cc +++ b/tests/unitary_calculator_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#if defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#if defined(__AVX512F__) && !defined(_WIN32) #include "../lib/formux.h" #include "../lib/unitary_calculator_avx512.h" diff --git a/tests/unitary_calculator_testfixture.h b/tests/unitary_calculator_testfixture.h index 3b7707c3..cd77ca43 100644 --- a/tests/unitary_calculator_testfixture.h +++ b/tests/unitary_calculator_testfixture.h @@ -425,8 +425,10 @@ void TestApplyFusedGate() { std::vector gates = {Cirq::H::Create(0, 0), Cirq::H::Create(1, 0)}; - GateFused fgate{Cirq::kH, 0, {0}, &gates[0], {&gates[0], &gates[1]}}; + GateFused fgate {Cirq::kH, 0, {0}, &gates[0], + {&gates[0], &gates[1]}, {}}; + CalculateFusedMatrix(fgate); ApplyFusedGate(uc, fgate, u); unsigned size = 1 << num_qubits; diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc index 21f2b0be..6301ec4d 100644 --- a/tests/unitaryspace_avx512_test.cc +++ b/tests/unitaryspace_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#if defined(__AVX512F__) && !defined(_WIN32) && !defined(__SANITIZE_ADDRESS__) +#if defined(__AVX512F__) && !defined(_WIN32) #include "../lib/formux.h" #include "../lib/unitaryspace_avx512.h"