Merge pull request #143 from quantumlib/stateful-for

Convert For to a stateful object: the thread count is stored in the For instance instead of being passed to every call.
quantumlib · Jul 7, 2020 · 5b6e26a
2 parents 8ad553c + 6a8c82d
commit 5b6e26a
Show file tree

Hide file tree

Showing 13 changed files with 130 additions and 139 deletions.
diff --git a/apps/qsim_von_neumann.cc b/apps/qsim_von_neumann.cc
@@ -108,8 +108,8 @@ int main(int argc, char* argv[]) {
       return p != 0 ? p * std::log(p) : 0;
     };
 
-    double entropy = -For::RunReduce(opt.num_threads, state_space.Size(), f,
-                                     Op(), state_space, state);
+    double entropy = -For(opt.num_threads).RunReduce(state_space.Size(), f,
+                                                     Op(), state_space, state);
     IO::messagef("entropy=%g\n", entropy);
   };
 

diff --git a/lib/hybrid.h b/lib/hybrid.h
@@ -86,6 +86,9 @@ struct HybridSimulator final {
     unsigned verbosity = 0;
   };
 
+  template <typename... Args>
+  explicit HybridSimulator(Args&&... args) : for_(args...) {}
+
   /**
    * Splits the lattice into two parts, using Schmidt decomposition for gates
    * on the cut.
@@ -242,12 +245,12 @@ struct HybridSimulator final {
    *   will be populated with amplitudes for each state in 'bitstrings'.
    * @return True if the simulation completed successfully; false otherwise.
    */
-  static bool Run(const Parameter& param, HybridData& hd,
-                  const std::vector<unsigned>& parts,
-                  const std::vector<GateFused>& fgates0,
-                  const std::vector<GateFused>& fgates1,
-                  const std::vector<uint64_t>& bitstrings,
-                  std::vector<std::complex<fp_type>>& results) {
+  bool Run(const Parameter& param, HybridData& hd,
+           const std::vector<unsigned>& parts,
+           const std::vector<GateFused>& fgates0,
+           const std::vector<GateFused>& fgates1,
+           const std::vector<uint64_t>& bitstrings,
+           std::vector<std::complex<fp_type>>& results) const {
     unsigned num_p_gates = param.num_prefix_gatexs;
     unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
 
@@ -374,8 +377,8 @@ struct HybridSimulator final {
         };
 
         // Collect results.
-        For::Run(param.num_threads, results.size(), f, sspace0, sspace1,
-                 *rstate0, *rstate1, indices, results);
+        for_.Run(results.size(), f, sspace0, sspace1, *rstate0, *rstate1,
+                 indices, results);
       }
     }
 
@@ -541,6 +544,8 @@ struct HybridSimulator final {
 
     return true;
   }
+
+  For for_;
 };
 
 }  // namespace qsim

diff --git a/lib/parfor.h b/lib/parfor.h
@@ -25,30 +25,29 @@ namespace qsim {
 
 template <uint64_t MIN_SIZE>
 struct ParallelForT {
+  explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {}
+
   // GetIndex0 and GetIndex1 are useful when we need to know how work was
   // divided between threads, for instance, for reusing partial sums obtained
   // by RunReduceP.
-  static uint64_t GetIndex0(
-      uint64_t size, unsigned num_threads, unsigned thread_id) {
+  uint64_t GetIndex0(uint64_t size, unsigned thread_id) const {
     return size >= MIN_SIZE ? size * thread_id / num_threads : 0;
   }
 
-  static uint64_t GetIndex1(
-      uint64_t size, unsigned num_threads, unsigned thread_id) {
+  uint64_t GetIndex1(uint64_t size, unsigned thread_id) const {
     return size >= MIN_SIZE ? size * (thread_id + 1) / num_threads : size;
   }
 
   template <typename Function, typename... Args>
-  static void Run(
-      unsigned num_threads, uint64_t size, Function&& func, Args&&... args) {
+  void Run(uint64_t size, Function&& func, Args&&... args) const {
     if (num_threads > 1 && size >= MIN_SIZE) {
       #pragma omp parallel num_threads(num_threads)
       {
         unsigned n = omp_get_num_threads();
         unsigned m = omp_get_thread_num();
 
-        uint64_t i0 = GetIndex0(size, n, m);
-        uint64_t i1 = GetIndex1(size, n, m);
+        uint64_t i0 = GetIndex0(size, m);
+        uint64_t i1 = GetIndex1(size, m);
 
         for (uint64_t i = i0; i < i1; ++i) {
           func(n, m, i, args...);
@@ -62,9 +61,8 @@ struct ParallelForT {
   }
 
   template <typename Function, typename Op, typename... Args>
-  static std::vector<typename Op::result_type> RunReduceP(
-      unsigned num_threads, uint64_t size, Function&& func, Op&& op,
-      Args&&... args) {
+  std::vector<typename Op::result_type> RunReduceP(
+      uint64_t size, Function&& func, Op&& op, Args&&... args) const {
     std::vector<typename Op::result_type> partial_results;
 
     if (num_threads > 1 && size >= MIN_SIZE) {
@@ -75,8 +73,8 @@ struct ParallelForT {
         unsigned n = omp_get_num_threads();
         unsigned m = omp_get_thread_num();
 
-        uint64_t i0 = GetIndex0(size, n, m);
-        uint64_t i1 = GetIndex1(size, n, m);
+        uint64_t i0 = GetIndex0(size, m);
+        uint64_t i1 = GetIndex1(size, m);
 
         typename Op::result_type partial_result = 0;
 
@@ -99,11 +97,9 @@ struct ParallelForT {
   }
 
   template <typename Function, typename Op, typename... Args>
-  static typename Op::result_type RunReduce(unsigned num_threads,
-                                            uint64_t size, Function&& func,
-                                            Op&& op, Args&&... args) {
-    auto partial_results = RunReduceP(
-        num_threads, size, func, std::move(op), args...);
+  typename Op::result_type RunReduce(uint64_t size, Function&& func,
+                                     Op&& op, Args&&... args) const {
+    auto partial_results = RunReduceP(size, func, std::move(op), args...);
 
     typename Op::result_type result = 0;
 
@@ -113,6 +109,8 @@ struct ParallelForT {
 
     return result;
   }
+
+  unsigned num_threads;
 };
 
 using ParallelFor = ParallelForT<1024>;

diff --git a/lib/run_qsimh.h b/lib/run_qsimh.h
@@ -91,7 +91,7 @@ struct QSimHRunner final {
       return false;
     }
 
-    rc = HybridSimulator::Run(
+    rc = HybridSimulator(param.num_threads).Run(
         param, hd, parts, fgates0, fgates1, bitstrings, results);
 
     if (rc && param.verbosity > 0) {
@@ -112,7 +112,6 @@ struct QSimHRunner final {
     IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs,
                  param.num_root_gatexs, num_suffix_gates);
   }
-
 };
 
 }  // namespace qsim

diff --git a/lib/seqfor.h b/lib/seqfor.h
@@ -22,28 +22,28 @@
 namespace qsim {
 
 struct SequentialFor {
-  static uint64_t GetIndex0(
-      uint64_t size, unsigned num_threads, unsigned thread_id) {
+  explicit SequentialFor(unsigned num_threads) {}
+
+  // SequentialFor has no state, so all of its methods can be static.
+
+  static uint64_t GetIndex0(uint64_t size, unsigned thread_id) {
     return 0;
   }
 
-  static uint64_t GetIndex1(
-      uint64_t size, unsigned num_threads, unsigned thread_id) {
+  static uint64_t GetIndex1(uint64_t size, unsigned thread_id) {
     return size;
   }
 
   template <typename Function, typename... Args>
-  static void Run(
-    unsigned num_threads, uint64_t size, Function&& func, Args&&... args) {
+  static void Run(uint64_t size, Function&& func, Args&&... args) {
     for (uint64_t i = 0; i < size; ++i) {
       func(1, 0, i, args...);
     }
   }
 
   template <typename Function, typename Op, typename... Args>
   static std::vector<typename Op::result_type> RunReduceP(
-      unsigned num_threads, uint64_t size, Function&& func, Op&& op,
-      Args&&... args) {
+      uint64_t size, Function&& func, Op&& op, Args&&... args) {
     typename Op::result_type result = 0;
 
     for (uint64_t i = 0; i < size; ++i) {
@@ -54,10 +54,9 @@ struct SequentialFor {
   }
 
   template <typename Function, typename Op, typename... Args>
-  static typename Op::result_type RunReduce(unsigned num_threads,
-                                            uint64_t size, Function&& func,
+  static typename Op::result_type RunReduce(uint64_t size, Function&& func,
                                             Op&& op, Args&&... args) {
-    return RunReduceP(num_threads, size, func, std::move(op), args...)[0];
+    return RunReduceP(size, func, std::move(op), args...)[0];
   }
 };
 

diff --git a/lib/simulator_avx.h b/lib/simulator_avx.h
@@ -32,8 +32,9 @@ class SimulatorAVX final {
   using State = typename StateSpace::State;
   using fp_type = typename StateSpace::fp_type;
 
-  SimulatorAVX(unsigned num_qubits, unsigned num_threads)
-      : num_qubits_(num_qubits), num_threads_(num_threads) {}
+  template <typename... Args>
+  explicit SimulatorAVX(unsigned num_qubits, Args&&... args)
+      : for_(args...), num_qubits_(num_qubits) {}
 
   /**
    * Applies a single-qubit gate using AVX instructions.
@@ -130,7 +131,7 @@ class SimulatorAVX final {
       _mm256_store_ps(rstate + p + 8, in);
     };
 
-    For::Run(num_threads_, sizei / 16, f, sizek, mask0, mask1, matrix, rstate);
+    for_.Run(sizei / 16, f, sizek, mask0, mask1, matrix, rstate);
   }
 
   // Applies a single-qubit gate for qubit <= 2.
@@ -223,8 +224,7 @@ class SimulatorAVX final {
       _mm256_store_ps(rstate + p + 8, in);
     };
 
-    For::Run(num_threads_, std::max(uint64_t{1}, sizei / 16), f, q0, ml,
-             matrix, rstate);
+    for_.Run(std::max(uint64_t{1}, sizei / 16), f, q0, ml, matrix, rstate);
   }
 
   // Applies two-qubit gate for qubit0 > 2 and qubit1 > 2.
@@ -376,8 +376,7 @@ class SimulatorAVX final {
       _mm256_store_ps(rstate + p + 8, in);
     };
 
-    For::Run(num_threads_, sizei / 16, f, sizej, sizek, mask0, mask1, mask2,
-             matrix, rstate);
+    for_.Run(sizei / 16, f, sizej, sizek, mask0, mask1, mask2, matrix, rstate);
   }
 
   // Applies a two-qubit gate for qubit0 <= 2 and qubit1 > 2.
@@ -586,8 +585,7 @@ class SimulatorAVX final {
       _mm256_store_ps(rstate + p + 8, in);
     };
 
-    For::Run(num_threads_, sizei / 16, f, sizej, mask0, mask1, q0, ml,
-             matrix, rstate);
+    for_.Run(sizei / 16, f, sizej, mask0, mask1, q0, ml, matrix, rstate);
   }
 
   // Applies a two-qubit gate for qubit0 <= 2 and qubit1 <= 2.
@@ -814,12 +812,12 @@ class SimulatorAVX final {
       _mm256_store_ps(rstate + p + 8, in);
     };
 
-    For::Run(num_threads_, std::max(uint64_t{1}, sizei / 16), f, q,
-             ml1, ml2, ml3, matrix, rstate);
+    for_.Run(
+        std::max(uint64_t{1}, sizei / 16), f, q, ml1, ml2, ml3, matrix, rstate);
   }
 
+  For for_;
   unsigned num_qubits_;
-  unsigned num_threads_;
 };
 
 }  // namespace qsim

diff --git a/lib/simulator_basic.h b/lib/simulator_basic.h
@@ -29,8 +29,9 @@ class SimulatorBasic final {
   using State = typename StateSpace::State;
   using fp_type = typename StateSpace::fp_type;
 
-  SimulatorBasic(unsigned num_qubits, unsigned num_threads)
-      : num_qubits_(num_qubits), num_threads_(num_threads) {}
+  template <typename... Args>
+  explicit SimulatorBasic(unsigned num_qubits, Args&&... args)
+      : for_(args...), num_qubits_(num_qubits) {}
 
   /**
    * Applies a single-qubit gate using sparse matrix-vector multiplication.
@@ -66,7 +67,7 @@ class SimulatorBasic final {
       rstate[si1 + 1] = s0r * u[5] + s0i * u[4] + s1r * u[7] + s1i * u[6];
     };
 
-    For::Run(num_threads_, sizei / 2, f, sizek, mask0, mask1, matrix, rstate);
+    for_.Run(sizei / 2, f, sizek, mask0, mask1, matrix, rstate);
   }
 
   /**
@@ -129,13 +130,12 @@ class SimulatorBasic final {
           + s2r * u[29] + s2i * u[28] + s3r * u[31] + s3i * u[30];
     };
 
-    For::Run(num_threads_, sizei / 2, f, sizej, sizek, mask0, mask1, mask2,
-             matrix, rstate);
+    for_.Run(sizei / 2, f, sizej, sizek, mask0, mask1, mask2, matrix, rstate);
   }
 
  private:
+  For for_;
   unsigned num_qubits_;
-  unsigned num_threads_;
 };
 
 }  // namespace qsim

diff --git a/lib/simulator_sse.h b/lib/simulator_sse.h
@@ -33,8 +33,9 @@ class SimulatorSSE final {
   using State = typename StateSpace::State;
   using fp_type = typename StateSpace::fp_type;
 
-  SimulatorSSE(unsigned num_qubits, unsigned num_threads)
-      : num_qubits_(num_qubits), num_threads_(num_threads) {}
+  template <typename... Args>
+  explicit SimulatorSSE(unsigned num_qubits, Args&&... args)
+      : for_(args...), num_qubits_(num_qubits) {}
 
   /**
    * Applies a single-qubit gate using SSE instructions.
@@ -131,7 +132,7 @@ class SimulatorSSE final {
       _mm_store_ps(rstate + p + 4, in);
     };
 
-    For::Run(num_threads_, sizei / 8, f, sizek, mask0, mask1, matrix, rstate);
+    for_.Run(sizei / 8, f, sizek, mask0, mask1, matrix, rstate);
   }
 
   // Applies a single-qubit gate for qubit <= 1.
@@ -212,8 +213,7 @@ class SimulatorSSE final {
       _mm_store_ps(rstate + p + 4, in);
     };
 
-    For::Run(num_threads_, std::max(uint64_t{1}, sizei / 8), f, q0,
-             matrix, rstate);
+    for_.Run(std::max(uint64_t{1}, sizei / 8), f, q0, matrix, rstate);
   }
 
   // Applies two-qubit gate for qubit0 > 1 and qubit1 > 1.
@@ -365,8 +365,7 @@ class SimulatorSSE final {
       _mm_store_ps(rstate + p + 4, in);
     };
 
-    For::Run(num_threads_, sizei / 8, f, sizej, sizek, mask0, mask1, mask2,
-             matrix, rstate);
+    for_.Run(sizei / 8, f, sizej, sizek, mask0, mask1, mask2, matrix, rstate);
   }
 
   // Applies a two-qubit gate for qubit0 <= 1 and qubit1 > 1.
@@ -557,8 +556,7 @@ class SimulatorSSE final {
       _mm_store_ps(rstate + p + 4, in);
     };
 
-    For::Run(num_threads_, sizei / 8, f, sizej, mask0, mask1, q0,
-             matrix, rstate);
+    for_.Run(sizei / 8, f, sizej, mask0, mask1, q0, matrix, rstate);
   }
 
   // Applies a two-qubit gate for qubit0 = 0 and qubit1 = 1.
@@ -601,11 +599,11 @@ class SimulatorSSE final {
           + s2r * u[29] + s2i * u[28] + s3r * u[31] + s3i * u[30];
     };
 
-    For::Run(num_threads_, sizei / 8, f, matrix, rstate);
+    for_.Run(sizei / 8, f, matrix, rstate);
   }
 
+  For for_;
   unsigned num_qubits_;
-  unsigned num_threads_;
 };
 
 }  // namespace qsim