From 4421bfaf933074f71f22a0761bc5a4424ef60d13 Mon Sep 17 00:00:00 2001
From: Fionn Malone <fmalone@google.com>
Date: Tue, 19 Sep 2023 05:37:44 +0000
Subject: [PATCH] Delete extra tutorial for later PR.

---
 dev_tools/autogenerate-bloqs-notebooks.py |  10 +
 qualtran/bloqs/chemistry/thc.ipynb        | 370 ++-----------------
 qualtran/bloqs/chemistry/thc.py           |  29 +-
 qualtran/bloqs/chemistry/thc_test.py      |  10 +-
 qualtran/bloqs/chemistry/thc_tutorial.py  | 419 ----------------------
 5 files changed, 58 insertions(+), 780 deletions(-)
 delete mode 100644 qualtran/bloqs/chemistry/thc_tutorial.py

diff --git a/dev_tools/autogenerate-bloqs-notebooks.py b/dev_tools/autogenerate-bloqs-notebooks.py
index d400b177e..ad5497724 100644
--- a/dev_tools/autogenerate-bloqs-notebooks.py
+++ b/dev_tools/autogenerate-bloqs-notebooks.py
@@ -59,6 +59,7 @@
 import qualtran.bloqs.basic_gates.toffoli_test
 import qualtran.bloqs.basic_gates.x_basis_test
 import qualtran.bloqs.basic_gates.z_basis_test
+import qualtran.bloqs.chemistry.thc_test
 import qualtran.bloqs.factoring.mod_exp
 import qualtran.bloqs.factoring.mod_exp_test
 import qualtran.bloqs.factoring.mod_mul_test
@@ -134,6 +135,15 @@
         ],
         directory=f'{SOURCE_DIR}/bloqs/factoring',
     ),
+    NotebookSpec(
+        title='Tensor Hypercontraction',
+        module=qualtran.bloqs.chemistry.thc,
+        gate_specs=[
+            BloqNbSpec(qualtran.bloqs.chemistry.thc_test._make_uniform_superposition),
+            BloqNbSpec(qualtran.bloqs.chemistry.thc_test._make_prepare),
+        ],
+        directory=f'{SOURCE_DIR}/bloqs/chemistry',
+    ),
 ]
 
 
diff --git a/qualtran/bloqs/chemistry/thc.ipynb b/qualtran/bloqs/chemistry/thc.ipynb
index e577e041c..5e3b34528 100644
--- a/qualtran/bloqs/chemistry/thc.ipynb
+++ b/qualtran/bloqs/chemistry/thc.ipynb
@@ -9,8 +9,7 @@
    "source": [
     "# Tensor Hypercontraction\n",
     "\n",
-    "In this notebook we describe the implementation of SELECT and PREPARE for the tensor hypercontraction (THC) factorized form of the second quantized chemistry hamiltonian.\n",
-    "We will discuss the THC Hamiltonian (which asymptotically yields the lowest complexity of all second quantized algorithms), before discussing the symmetries which can be exploited, and finally provide circuits and costs for PREPARE and SELECT, comparing to available [reference values](https://arxiv.org/pdf/2011.03494.pdf) where possible."
+    "SELECT and PREPARE for the molecular tensor hypercontraction (THC) hamiltonian"
    ]
   },
   {
@@ -25,194 +24,7 @@
     "from qualtran import Bloq, CompositeBloq, BloqBuilder, Signature, Register\n",
     "from qualtran.drawing import show_bloq\n",
     "from typing import *\n",
-    "import numpy as np\n",
-    "import cirq"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "980ee37f",
-   "metadata": {},
-   "source": [
-    "## `State preparation`\n",
-    "\n",
-    "PREPARE in THC requires state preparation to occur in two steps. In the first step we prepare a uniform superposition over a number of registers before we load the appropriate Hamiltonian coefficients which are defined over the registers we prepared in the first step. That is, if $H = \\sum_{pq}^N \\zeta_{pq} a^{\\dagger}_p a_q$ then we need at least two registers ($p$ and $q$) each of size $\\lceil \\log N \\rceil$ bits. The state we want to prepare is of the form $|\\psi\\rangle \\sim \\sum_{pq} \\sqrt{\\zeta_{pq}}|pq\\rangle$. This second step usually involves loading the parameters in QROM (or more commonly QRO(A)M) before performing alias sampling to prepare the appropriate superposition state. At a high level step one usually has a logarithmic cost in the size of the registers, while step 2 has a cost that goes like $\\sqrt{d}$ where $d$ is the amount of data to load and arises from the QROM circuit (in the above example $d = N^2$). As step 2 is typically **the** dominant cost in the second quantized algorithms, a lot of effort is placed on loading the **minimum** amount of data required to specify the Hamiltonian. In particular, the quantum chemistry Hamiltonian exhibits several symmetries which can be exploited to reduce the QROM cost by several factors of two.\n",
-    "\n",
-    "### `Symmetries`\n",
-    "\n",
-    "### `Simplified Example`\n",
-    "\n",
-    "#### `Uniform superposition`\n",
-    "Before discussing the two THC PREPARE circuits, it is helpful to think about implementing a simplified state preparation circuit which can be analyzed more easily. Specifically let us consider the example above and try to implement the state\n",
-    "\n",
-    "$$\n",
-    "|0\\rangle^{\\otimes 2 \\log M} \\rightarrow \\frac{1}{\\sqrt{\\lambda}} \\sum_{\\mu\\le\\nu}^M\\sqrt{\\zeta_{\\mu\\nu}}|\\mu\\rangle|\\nu\\rangle,\n",
-    "$$\n",
-    "which demonstrates the main ideas. Above we are essentially assuming our matrix $\\zeta$ is symmetric so that we only need to load $d = M (M+1)/2$ data rather than $M^2$ which is a modest, but useful improvement.\n",
-    "We will also assume for the moment that the matrix has all positive matrix elements.\n",
-    "\n",
-    "Step 1 is to prepare a uniform superposition over our registers. We will use a single round of amplitude amplification to boost our success probability to close to 1. Our circuit is a simplified version of that given in Fig. 3 of [Ref 1](https://arxiv.org/pdf/2011.03494.pdf). Here it is: "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e42cfb87",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import cirq_ft\n",
-    "from cirq_ft.infra import testing as cft_testing\n",
-    "from cirq_ft.infra.jupyter_tools import display_gate_and_compilation\n",
-    "from qualtran.bloqs.chemistry.thc_tutorial import UniformPrepareUpperTriangular\n",
-    "\n",
-    "# dim := M here \n",
-    "# Let's keep it simple with a 2x2 matrix to prepare\n",
-    "dim = 2\n",
-    "zeta = np.random.randint(0, 10, size=(dim, dim))\n",
-    "\n",
-    "upper_triang = UniformPrepareUpperTriangular(n=dim)\n",
-    "context_manager = cirq_ft.GreedyQubitManager(prefix=\"_a\", maximize_reuse=True)\n",
-    "upper_triang_gh = cft_testing.GateHelper(\n",
-    "    upper_triang, context=context_manager\n",
-    ")\n",
-    "\n",
-    "display_gate_and_compilation(upper_triang_gh)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "76f02a3f",
-   "metadata": {},
-   "source": [
-    "Walking through the circuit left to right we have:\n",
-    "\n",
-    "1. Prepare a uniform superposition over the $p$ and $q$ registers with Hadamards with zero Toffoli cost.\n",
-    "2. We rotate an ancilla qubit by \"adding a constant into a phase gradient register\". There are a few things to unpack here:\n",
-    "    * The constant (i.e. angle) is given by $0.436 \\pi = \\arccos(1-8/10)$, where $8=2^{4-1}$ and $10 = 4 \\times (4 + 1) / 2$. That is, (one minus) the ratio of the number of bits used needed to prepare our state and the amount of data we are going to prepare. The choice of this angle is described in more detail in the [amplitude amplification tutorial]().\n",
-    "    * The phase gradient register is a relatively cheap way to implement single qubit rotations fault tolerantly. For the moment let us just note that for us we need a single qubit $R_y$ gate and it costs b_r - 3 Toffolis, where $b_r=7$ is the number of bits used to represent the angle and this has found to be sufficient. See [techniques for single qubit rotations]() for further discussion on fault tolerant implementations of single qubit rotations.\n",
-    "3. Next we perform an inequality test on the $p$ and $q$ registers, and write the result of $p\\le q$ into another ancilla register. This has cost n_p - 1 Toffolis where $n_p$ is the number of bits required to represent the range of $p$ values.\n",
-    "4. We use a Toffoli gate to control off of the result of the inequality test and the $Ry$ rotation. If both these succeed then we set the third ancilla to on and phase this ancilla with a $Z$ gate. We then undo the Toffoli. This step has a cost of 2 Toffoli gates and 1 Clifford operation.\n",
-    "5. We undo the inequality test and Hadamards and reflect around zero and apply the Hadamards again. The reflection has a Toffoli cost 2 n_p + 1\n",
-    "\n",
-    "Phew! That's quite a lot just to prepare a uniform superposition state. Let's check our predicted T count (which we take from the THC paper) against that computed by our circuit: "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "20ee9c2a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cirq_ft.infra import t_complexity\n",
-    "tcomp_cirq = t_complexity(upper_triang)\n",
-    "n_p = 2 # we need 2 bits for our example\n",
-    "b_r = 7 # Taken from the THC reference.\n",
-    "# vs (only count toffolis)\n",
-    "# If we assume the same cost as cirq_ft for a Toffoli of 7 T gates then we multiply everything by 7\n",
-    "expected = 7 * (\n",
-    "    2*(b_r - 3) # 2x rotations\n",
-    "    +\n",
-    "    2*(n_p - 1)# 2x inequality\n",
-    "    +\n",
-    "    2 # Two Toffolis for phasing third ancilla.\n",
-    "    +\n",
-    "    2 * n_p + 1 # Cost for reflections\n",
-    ")\n",
-    "# Add in the cost of rotations to the cirq t complexity cost\n",
-    "print(f\"Computed = {tcomp_cirq.t + 2*(b_r-3)*7} vs Expected = {expected}\")\n",
-    "# print(f\"Computed = {tcomp_cirq.t} vs Expected = {expected}. Difference = {tcomp_cirq.t-expected}.\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "adb605bd",
-   "metadata": {},
-   "source": [
-    "Oh no! Our computed estimate differs from our enumeration. What happened? We see that our calculated cost for the inequality tests is 24 T gates vs what we expect from $2\\times7\\times(n_p - 1) = 14$. Our computed reflections also cost 40 T gates rather 35 T gates which we expect. These two differences make up 10 + 5 = 15. These small differences highlight some of the complications which arise when trying to reproduce resource estimates `exactly`, i.e. the costs associated with different primitives may be slightly different than those listed in the paper, which is an unfortunate fact of life.\n",
-    "\n",
-    "Anyway, let's move on and check that our circuit is actually correct. Here we will make use of some utility functions and analyze the statevector."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1918b311",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qualtran.bloqs.chemistry.thc_tutorial import analyze_state_vector\n",
-    "state_vector = analyze_state_vector(upper_triang_gh, length=dim*dim)\n",
-    "print(cirq.dirac_notation(state_vector))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b058f227",
-   "metadata": {},
-   "source": [
-    "We see that (as hoped), we have successfully prepared the state with $p \\le q$, where $p$ is stored in binary in the first two bits and q the second two, and our normalization seems to be consistent. (What is cirq doing when I just provide the circuit and not the decomposed one?) "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "795cec00",
-   "metadata": {},
-   "source": [
-    "#### `State Preparation`\n",
-    "\n",
-    "Now that we have our initial state we can prepare our amplitudes using [coherent alias sampling](). Here is our simplified circuit:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fa073074",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qualtran.bloqs.chemistry.thc_tutorial import PrepareUpperTriangular\n",
-    "flat_data = zeta[np.triu_indices(dim)]\n",
-    "assert len(flat_data) == dim * (dim + 1) // 2\n",
-    "lambda_zeta  = np.sum(np.abs(flat_data))\n",
-    "probs = flat_data / np.sqrt(lambda_zeta)\n",
-    "# number of bits of precision for lcu probabilities\n",
-    "mu = 3\n",
-    "prep = PrepareUpperTriangular.build(dim, probs, epsilon=2**-mu / len(probs))\n",
-    "ut_prep_gh = cft_testing.GateHelper(\n",
-    "    prep\n",
-    ")\n",
-    "\n",
-    "display_gate_and_compilation(ut_prep_gh)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "08f0a247",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qualtran.bloqs.chemistry.thc_tutorial import analyze_state_vector\n",
-    "state_vector = analyze_state_vector(ut_prep_gh, length=dim*dim)\n",
-    "print(state_vector)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "496c9c45",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prep = cirq_ft.PrepareUniformSuperposition(3)\n",
-    "one_d_prep = cft_testing.GateHelper(\n",
-    "    prep\n",
-    ")\n",
-    "state_vector = analyze_state_vector(gate_helper, length=dim*dim)\n",
-    "print(state_vector*state_vector.conj())"
+    "import numpy as np"
    ]
   },
   {
@@ -267,41 +79,6 @@
     "show_bloq(bloq)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ebfe7689",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qualtran.drawing import get_musical_score_data, draw_musical_score\n",
-    "cbloq = bloq.decompose_bloq()\n",
-    "msd = get_musical_score_data(cbloq)\n",
-    "fig, ax = draw_musical_score(msd)\n",
-    "fig.tight_layout()\n",
-    "print(cbloq.t_complexity())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "08a77982",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "t_comp = []\n",
-    "num_spatial = range(50, 10_000, 100)\n",
-    "for ns in num_spatial:\n",
-    "    bloq = UniformSuperpositionTHC(num_mu=6*ns, num_spin_orb=2*ns)\n",
-    "    t_comp.append(bloq.decompose_bloq().t_complexity().t)\n",
-    "\n",
-    "plt.plot(num_spatial, np.array(t_comp)/11, marker='o')\n",
-    "plt.plot(num_spatial, 10*np.array([(6*ns).bit_length() for ns in num_spatial]) + 3)\n",
-    "# plt.yscale(\"log\")\n",
-    "# plt.xscale(\"log\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "5b253057",
@@ -321,15 +98,26 @@
     "    \\right].\n",
     "$$\n",
     "\n",
+    "Note we use UniformSuperpositionTHC as a subroutine as part of this bloq in\n",
+    "contrast to the reference which keeps them separate.\n",
+    "\n",
     "#### Parameters\n",
     " - `num_mu`: THC auxiliary index dimension $M$\n",
     " - `num_spin_orb`: number of spin orbitals $N$\n",
+    " - `alt_mu`: Alternate values for mu indices.\n",
+    " - `alt_nu`: Alternate values for nu indices.\n",
+    " - `alt_theta`: Alternate values for theta indices.\n",
+    " - `theta`: Signs of lcu coefficients.\n",
+    " - `keep`: keep values.\n",
     " - `keep_bitsize`: number of bits for keep register for coherent alias sampling. \n",
     "\n",
     "Registers:\n",
     " - mu: $\\mu$ register.\n",
     " - nu: $\\nu$ register.\n",
     " - theta: sign register.\n",
+    " - succ: success flag qubit from uniform state preparation\n",
+    " - eq_nu_mp1: flag for if $nu = M+1$\n",
+    " - plus_a / plus_b: plus state for controlled swaps on spins.\n",
     "\n",
     "#### References\n",
     "[Even more efficient quantum computations of chemistry through tensor hypercontraction](https://arxiv.org/pdf/2011.03494.pdf) Fig. 2 and Fig. 3.\n"
@@ -346,133 +134,15 @@
    "source": [
     "from qualtran.bloqs.chemistry.thc import PrepareTHC\n",
     "\n",
-    "num_mu = 10\n",
-    "num_spin_orb = 4\n",
-    "\n",
-    "bloq = PrepareTHC(num_mu=num_mu, num_spin_orb=num_spin_orb, keep_bitsize=8)\n",
+    "num_spat = 4\n",
+    "num_mu = 8\n",
+    "t_l = np.random.normal(0, 1, size=num_spat)\n",
+    "zeta = np.random.normal(0, 1, size=(num_mu, num_mu))\n",
+    "zeta = 0.5 * (zeta + zeta.T)\n",
+    "eps = 1e-3\n",
+    "bloq = PrepareTHC.build(t_l, zeta, probability_epsilon=eps)\n",
     "show_bloq(bloq)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "93e23c6b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cbloq = bloq.decompose_bloq() \n",
-    "show_bloq(cbloq)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dca4a7e0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qualtran.drawing import get_musical_score_data, draw_musical_score\n",
-    "cbloq = bloq.decompose_bloq()\n",
-    "msd = get_musical_score_data(cbloq)\n",
-    "fig, ax = draw_musical_score(msd)\n",
-    "fig.tight_layout()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7b4ffed8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(cbloq.t_complexity())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "70282cd5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%pip install openfermion\n",
-    "%pip install pyscf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "da33771f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "from openfermion.resource_estimates.utils import QR, QI\n",
-    "def prepare_cost(M, n, chi, stps):\n",
-    "    # The number of bits used for each register.\n",
-    "    nM = np.ceil(np.log2(M + 1))\n",
-    "    # This is the number of distinct items of data we need to output.\n",
-    "    d = M * (M + 1) // 2 + n // 2\n",
-    "    # The number of bits used for the contiguous register.\n",
-    "    # and 2 for the two sign bits, as in Eq. (29).\n",
-    "    m = 2 * nM + 2 + chi\n",
-    "    # Set it to be the number of bits that minimises the cost, usually 7.\n",
-    "    # Python is 0-index, so need to add the one back in vs mathematica nb\n",
-    "    br = 7 + 1\n",
-    "\n",
-    "    # This is the costing for preparing the equal superposition over the\n",
-    "    # input registers from below Eq. (27).\n",
-    "    cp1 = 2 * (10 * nM + 2 * br - 9)\n",
-    "\n",
-    "    # This is the cost of computing the contiguous register and inverting it.\n",
-    "    # This is with a sophisticated scheme adding together triplets of bits.\n",
-    "    # This is the cost of step 2 in the list on pages 15 and 16, with a factor\n",
-    "    # of 2 to account for its inverse.\n",
-    "    cp2 = 2 * (nM**2 + nM - 1)\n",
-    "\n",
-    "    # This is the cost of the QROM for the state preparation in step 3 and its\n",
-    "    # inverse. Note: arg_min is first value, min is second value\n",
-    "    cp3 = QR(d, m)[1] + QI(d)[1]\n",
-    "\n",
-    "    # The cost for the inequality test in step 4 and its inverse.\n",
-    "    cp4 = 2 * chi\n",
-    "\n",
-    "    # The cost 2*nM for the controlled swap in step 5 and its inverse.\n",
-    "    cp5 = 4 * nM\n",
-    "\n",
-    "    # Then there is a cost of nM + 1 for swapping the mu and nu registers in\n",
-    "    # step 6, where the + 1 is because we need to control on two registers.\n",
-    "    # There is the same cost for the inverse.\n",
-    "    cp6 = 2 * nM + 2\n",
-    "\n",
-    "    # This is the total cost in Eq. (32).\n",
-    "    print(cp1, cp2, cp3, cp4, cp5, cp6)\n",
-    "    CPCP = cp1 + cp2 + cp3 + cp4 + cp5 + cp6\n",
-    "    return CPCP"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cb556edd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "t_comp = []\n",
-    "t_comp_of = []\n",
-    "num_spatial = range(10, 100, 100)\n",
-    "for ns in num_spatial:\n",
-    "    bloq = PrepareTHC(num_mu=6*ns, num_spin_orb=2*ns, keep_bitsize=12)\n",
-    "    bloq2 = UniformSuperpositionTHC(num_mu=6*ns, num_spin_orb=2*ns)\n",
-    "    t_comp.append(bloq.decompose_bloq().t_complexity().t-bloq2.decompose_bloq().t_complexity().t)\n",
-    "    t_comp_of.append(prepare_cost(6*ns, 2*ns, 12, 10))\n",
-    "\n",
-    "plt.xscale(\"log\")\n",
-    "plt.yscale(\"log\")\n",
-    "plt.plot(num_spatial, np.array(t_comp), marker='o')\n",
-    "plt.plot(num_spatial, np.array(t_comp_of), marker='^')"
-   ]
   }
  ],
  "metadata": {
diff --git a/qualtran/bloqs/chemistry/thc.py b/qualtran/bloqs/chemistry/thc.py
index 8c5373479..5cbcdf75a 100644
--- a/qualtran/bloqs/chemistry/thc.py
+++ b/qualtran/bloqs/chemistry/thc.py
@@ -244,12 +244,20 @@ class PrepareTHC(Bloq):
     Args:
         num_mu: THC auxiliary index dimension $M$
         num_spin_orb: number of spin orbitals $N$
+        alt_mu: Alternate values for mu indices.
+        alt_nu: Alternate values for nu indices.
+        alt_theta: Alternate values for theta indices.
+        theta: Signs of lcu coefficients.
+        keep: keep values.
         keep_bitsize: number of bits for keep register for coherent alias sampling.
 
     Registers:
      - mu: $\mu$ register.
      - nu: $\nu$ register.
      - theta: sign register.
+     - succ: success flag qubit from uniform state preparation
+     - eq_nu_mp1: flag for if $nu = M+1$
+     - plus_a / plus_b: plus state for controlled swaps on spins.
 
     References:
         [Even more efficient quantum computations of chemistry through
@@ -272,6 +280,10 @@ def signature(self) -> Signature:
                 Register("mu", bitsize=(self.num_mu).bit_length()),
                 Register("nu", bitsize=(self.num_mu).bit_length()),
                 Register("theta", bitsize=1),
+                Register("succ", bitsize=1),
+                Register("eq_nu_mp1", bitsize=1),
+                Register("plus_a", bitsize=1),
+                Register("plus_b", bitsize=1),
             ]
         )
 
@@ -310,7 +322,7 @@ def build(cls, t_l, zeta, probability_epsilon=1e-8) -> 'PrepareTHC':
             alt_nu=tuple(alt_nu),
             alt_theta=tuple(alt_theta),
             theta=tuple(thetas),
-            keep=keep,
+            keep=tuple(keep),
             keep_bitsize=mu,
         )
 
@@ -319,7 +331,7 @@ def build_composite_bloq(self, bb: 'BloqBuilder', **regs: SoquetT) -> Dict[str,
         log_mu = (self.num_mu).bit_length()
         log_d = (data_size - 1).bit_length()
         # Allocate ancillae
-        succ, eq_nu_mp1, flag_plus, less_than, alt_theta = bb.split(bb.allocate(5))
+        less_than, alt_theta = bb.split(bb.allocate(2))
         s = bb.allocate(log_d)
         sigma = bb.allocate(self.keep_bitsize)
         keep = bb.allocate(self.keep_bitsize)
@@ -331,8 +343,8 @@ def build_composite_bloq(self, bb: 'BloqBuilder', **regs: SoquetT) -> Dict[str,
             UniformSuperpositionTHC(num_mu=self.num_mu, num_spin_orb=self.num_spin_orb),
             mu=regs['mu'],
             nu=regs['nu'],
-            succ=succ,
-            eq_nu_mp1=eq_nu_mp1,
+            succ=regs['succ'],
+            eq_nu_mp1=regs['eq_nu_mp1'],
         )
         # 2. Make contiguous register from mu and nu and store in register `s`.
         mu, nu, s = bb.add(ToContiguousIndex(log_mu, log_d), mu=mu, nu=nu, s=s)
@@ -359,7 +371,6 @@ def build_composite_bloq(self, bb: 'BloqBuilder', **regs: SoquetT) -> Dict[str,
         keep, sigma, less_than = add_from_bloq_register_flat_qubits(
             bb, lte_gate, keep=keep, sigma=sigma, less_than=less_than
         )
-        # TODO: uncomment once controlled bloq decomposes correctly.
         cz = CirqGateAsBloq(cirq.ControlledGate(cirq.Z))
         alt_theta, less_than = add_from_bloq_register_flat_qubits(
             bb, cz, alt_theta=alt_theta, less_than=less_than
@@ -374,17 +385,19 @@ def build_composite_bloq(self, bb: 'BloqBuilder', **regs: SoquetT) -> Dict[str,
         keep, sigma, less_than = add_from_bloq_register_flat_qubits(
             bb, lte_gate, keep=keep, sigma=sigma, less_than=less_than
         )
-        flag_plus = bb.add(Hadamard(), q=flag_plus)
+        # Reuse one of the |+> states later in prepare
+        plus_a = bb.add(Hadamard(), q=regs['plus_a'])
+        plus_b = bb.add(Hadamard(), q=regs['plus_b'])
         # negative cotrol on flag register
         less_than, flag_plus = add_from_bloq_register_flat_qubits(
             bb, cz, less_than=less_than, flag_plus=flag_plus
         )
         flag_plus, mu, nu = bb.add(CSwapApprox(bitsize=log_mu), ctrl=flag_plus, x=mu, y=nu)
-        bb.free(bb.join(np.array([succ, eq_nu_mp1, flag_plus, less_than, alt_theta])))
+        bb.free(bb.join(np.array([less_than, alt_theta])))
         bb.free(s)
         bb.free(sigma)
         bb.free(keep)
         bb.free(alt_mu)
         bb.free(alt_nu)
-        out_regs = {'mu': mu, 'nu': nu, 'theta': theta}
+        out_regs = {'mu': mu, 'nu': nu, 'theta': theta, 'plus_a': plus_a, 'plus_b': plus_b}
         return out_regs
diff --git a/qualtran/bloqs/chemistry/thc_test.py b/qualtran/bloqs/chemistry/thc_test.py
index 2444b5ac7..ca5389887 100644
--- a/qualtran/bloqs/chemistry/thc_test.py
+++ b/qualtran/bloqs/chemistry/thc_test.py
@@ -39,9 +39,13 @@ def _make_uniform_superposition():
 def _make_prepare():
     from qualtran.bloqs.chemistry.thc import PrepareTHC
 
-    num_mu = 10
-    num_spin_orb = 4
-    return PrepareTHC(num_mu=num_mu, num_spin_orb=num_spin_orb, keep_bitsize=8)
+    num_spat = 4
+    num_mu = 8
+    t_l = np.random.normal(0, 1, size=num_spat)
+    zeta = np.random.normal(0, 1, size=(num_mu, num_mu))
+    zeta = 0.5 * (zeta + zeta.T)
+    eps = 1e-3
+    return PrepareTHC.build(t_l, zeta, probability_epsilon=eps)
 
 
 def test_split_join_arithmetic_gates():
diff --git a/qualtran/bloqs/chemistry/thc_tutorial.py b/qualtran/bloqs/chemistry/thc_tutorial.py
deleted file mode 100644
index c2091e751..000000000
--- a/qualtran/bloqs/chemistry/thc_tutorial.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# Copyright 2023 The Cirq Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Iterable, Sequence, Tuple, Union
-
-import attr
-import cirq
-import numpy as np
-from cirq._compat import cached_property
-from cirq_ft import infra, linalg
-from cirq_ft.algos import arithmetic_gates, swap_network
-from cirq_ft.algos.multi_control_multi_target_pauli import MultiControlPauli
-from cirq_ft.algos.select_swap_qrom import SelectSwapQROM
-from numpy.typing import NDArray
-
-# TODO: This is more similar to THC circuit but there seems to be a bug with multicontrolled-Z
-# @attr.frozen
-# class UniformPrepareUpperTriangular(infra.GateWithRegisters):
-#     r"""Prepares a uniform superposition over first $n * (n + 1) / 2$ basis states.
-
-#     Prepares the state
-
-#     $$
-#         |\psi\rangle = \frace{1}{\sqrt{n*(n+1)/2}} \sum_{p \le q} |p q\rangle
-#     $$
-
-#     Args:
-#         n: The gate prepares a uniform superposition over two $n$ basis state registers.
-#         cv: Control values for each control qubit. If specified, a controlled version
-#             of the gate is constructed.
-
-#     References:
-#         See Fig 12 of https://arxiv.org/abs/1805.03662 for more details.
-#     """
-
-#     n: int
-
-#     @cached_property
-#     def registers(self) -> infra.Registers:
-#         return infra.Registers.build(p=(self.n - 1).bit_length(), q=(self.n - 1).bit_length())
-
-#     def __repr__(self) -> str:
-#         return f"cirq_ft.PrepareUniformSuperpositionTwoRegisters({self.n}, {self.n})"
-
-#     def _circuit_diagram_info_(self, args: cirq.CircuitDiagramInfoArgs) -> cirq.CircuitDiagramInfo:
-#         target_symbols = ['p'] * self.registers['p'].total_bits()
-#         target_symbols += ['q'] * self.registers['q'].total_bits()
-#         target_symbols[0] = f"UNIFORM({self.n})"
-#         return cirq.CircuitDiagramInfo(wire_symbols=target_symbols)
-
-#     def decompose_from_registers(
-#         self,
-#         *,
-#         context: cirq.DecompositionContext,
-#         **quregs: NDArray[cirq.Qid],  # type:ignore[type-var]
-#     ) -> cirq.OP_TREE:
-#         p, q = quregs['p'], quregs['q']
-#         # Find K and L as per https://arxiv.org/abs/1805.03662 Fig 12.
-#         yield cirq.H.on_each(*p)
-#         yield cirq.H.on_each(*q)
-#         # Alloc
-#         ancilla = context.qubit_manager.qalloc(3)
-
-#         l = self.n * (self.n + 1) // 2
-#         theta = np.arccos(1 - (2 ** np.floor(np.log2(l))) / l)
-#         yield cirq.Ry(rads=theta)(ancilla[0])
-#         logn = (self.n - 1).bit_length()
-#         yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[1])
-#         yield cirq.X.on(ancilla[2])
-#         yield MultiControlPauli(cvs=[1, 1], target_gate=cirq.Z).on_registers(
-#             controls=ancilla[:2], target=ancilla[2]
-#         )
-#         yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[1])
-#         yield cirq.Ry(rads=-theta)(ancilla[0])
-
-#         # Second half of circuit
-#         yield cirq.H.on_each(*p)
-#         yield cirq.H.on_each(*q)
-#         control_pq = self.registers.merge_qubits(**quregs)
-#         yield MultiControlPauli(cvs=[1] * 2 * logn, target_gate=cirq.Z).on_registers(
-#             controls=control_pq, target=ancilla[0]
-#         )
-#         yield cirq.H.on_each(*p)
-#         yield cirq.H.on_each(*q)
-#         yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[1])
-#         yield MultiControlPauli(cvs=[1, 1], target_gate=cirq.Z).on_registers(
-#             controls=ancilla[:2], target=ancilla[2]
-#         )
-#         yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[1])
-#         yield cirq.X.on(ancilla[2])
-
-#         # Free
-#         context.qubit_manager.qfree([*ancilla])
-
-
-def analyze_state_vector(gate_helper, length: int):
-    circuit = cirq.Circuit(cirq.decompose_once(gate_helper.operation))
-    # result = cirq.Simulator(dtype=np.complex128).simulate(gate_helper.circuit)
-    result = cirq.Simulator(dtype=np.complex128).simulate(circuit)
-    state_vector = result.final_state_vector
-    # State vector is of the form |l>|temp_{l}>. We trace out the |temp_{l}> part to
-    # get the coefficients corresponding to |l>.
-    # L, logL = length, length.bit_length()
-    # state_vector = state_vector.reshape(2**logL, len(state_vector) // 2**logL)
-    # num_non_zero = (abs(state_vector) > 1e-6).sum(axis=1)
-    # prepared_state = state_vector.sum(axis=1)
-    # assert all(num_non_zero[:L] > 0) and all(num_non_zero[L:] == 0)
-    # assert all(np.abs(prepared_state[:L]) > 1e-6) and all(np.abs(prepared_state[L:]) <= 1e-6)
-    # prepared_state = prepared_state[:L] / np.sqrt(num_non_zero[:L])
-    return state_vector
-
-
-# index_1 = lambda x, y: (x)*(x-1)//2 + y
-# ndim = 4
-# for i in range(0, ndim):
-#     for j in range(0, i + 1):
-#         print(i, j, index_1(i, j))
-
-# print()
-# index_2 = lambda x, y: (x+1)*(x)//2 + y
-# for i in range(0, ndim):
-#     for j in range(0, i + 1):
-#         print(i, j, index_2(i, j))
-
-
-@attr.frozen
-class ContiguousRegisterGateEqual(cirq.ArithmeticGate):
-    """Applies U|x>|y>|0> -> |x>|y>|(y+1)y//2 + x>
-
-    This is useful in the case when $|x>$ and $|y>$ represent two selection registers such that
-     $x <= y$. For example, imagine a classical for-loop over two variables $x$ and $y$:
-
-     References:
-         [Even More Efficient Quantum Computations of Chemistry Through Tensor Hypercontraction]
-         (https://arxiv.org/abs/2011.03494)
-            Lee et. al. (2020). Appendix F, Page 67.
-    """
-
-    bitsize: int
-    target_bitsize: int
-
-    def registers(self) -> Sequence[Union[int, Sequence[int]]]:
-        return [2] * self.bitsize, [2] * self.bitsize, [2] * self.target_bitsize
-
-    def with_registers(self, *new_registers) -> 'ContiguousRegisterGateEqual':
-        x_bitsize, y_bitsize, target_bitsize = [len(reg) for reg in new_registers]
-        assert (
-            x_bitsize == y_bitsize
-        ), f'x_bitsize={x_bitsize} should be same as y_bitsize={y_bitsize}'
-        return ContiguousRegisterGateEqual(x_bitsize, target_bitsize)
-
-    def apply(self, *register_vals: int) -> Union[int, Iterable[int]]:
-        x, y, target = register_vals
-        return x, y, target ^ ((y * (y + 1)) // 2 + x)
-
-    def _circuit_diagram_info_(self, _) -> cirq.CircuitDiagramInfo:
-        wire_symbols = ["In(x)"] * self.bitsize
-        wire_symbols += ["In(y)"] * self.bitsize
-        wire_symbols += ['+(y(y-1)/2 + x)'] * self.target_bitsize
-        return cirq.CircuitDiagramInfo(wire_symbols=wire_symbols)
-
-    def _t_complexity_(self) -> infra.TComplexity:
-        # See the linked reference for explanation of the Toffoli complexity.
-        toffoli_complexity = infra.t_complexity(cirq.CCNOT)
-        return (self.bitsize**2 + self.bitsize - 1) * toffoli_complexity
-
-    def __repr__(self) -> str:
-        return f'cirq_ft.ContiguousRegisterGateEqual({self.bitsize}, {self.target_bitsize})'
-
-    def __pow__(self, power: int):
-        if power in [1, -1]:
-            return self
-        return NotImplemented  # pragma: no cover
-
-
-@attr.frozen
-class UniformPrepareAlt(infra.GateWithRegisters):
-    r"""Prepares a uniform superposition over first $n * (n + 1) / 2$ basis states.
-
-    Prepares the state
-
-    $$
-        |\psi\rangle = \frace{1}{\sqrt{n*(n+1)/2}} \sum_{p \le q} |p q\rangle
-    $$
-
-    Args:
-        n: The gate prepares a uniform superposition over two $n$ basis state registers.
-
-        cv: Control values for each control qubit. If specified, a controlled version
-            of the gate is constructed.
-
-    References:
-        See Fig 12 of https://arxiv.org/abs/1805.03662 for more details.
-    """
-
-    n: int
-
-    @cached_property
-    def registers(self) -> infra.Registers:
-        return infra.Registers.build(p=(self.n - 1).bit_length())
-
-    def __repr__(self) -> str:
-        return f"cirq_ft.PrepareUniformAlt({self.n}, {self.n})"
-
-    def _circuit_diagram_info_(self, args: cirq.CircuitDiagramInfoArgs) -> cirq.CircuitDiagramInfo:
-        target_symbols = ['p'] * self.registers['p'].total_bits()
-        target_symbols[0] = f"UNIFORM({self.n})"
-        return cirq.CircuitDiagramInfo(wire_symbols=target_symbols)
-
-    def decompose_from_registers(
-        self,
-        *,
-        context: cirq.DecompositionContext,
-        **quregs: NDArray[cirq.Qid],  # type:ignore[type-var]
-    ) -> cirq.OP_TREE:
-        p = quregs['p']
-        yield cirq.H.on_each(*p)
-        # Alloc
-        ancilla = context.qubit_manager.qalloc(3)
-
-        l = self.n
-        theta = np.arccos(1 - (2 ** np.floor(np.log2(l))) / l)
-        logn = (self.n - 1).bit_length()
-        yield arithmetic_gates.LessThanGate(logn, self.n).on(*p, ancilla[0])
-        yield cirq.Ry(rads=theta)(ancilla[1])
-        yield cirq.CCX(ancilla[0], ancilla[1], ancilla[2])
-        yield cirq.Z(ancilla[2])
-        yield cirq.CCX(ancilla[0], ancilla[1], ancilla[2])
-        yield cirq.Ry(rads=-theta)(ancilla[1])
-        yield arithmetic_gates.LessThanGate(logn, self.n).on(*p, ancilla[0])
-
-        # Second half of circuit
-        yield cirq.H.on_each(*p)
-        yield MultiControlPauli(cvs=[0] * (logn + 2), target_gate=cirq.X).on_registers(
-            controls=list(p) + list(ancilla[:2]), target=ancilla[2]
-        )
-        yield cirq.Z(ancilla[2])
-        yield MultiControlPauli(cvs=[0] * (logn + 2), target_gate=cirq.X).on_registers(
-            controls=list(p) + list(ancilla[:2]), target=ancilla[2]
-        )
-        yield cirq.H.on_each(*p)
-        yield cirq.Ry(rads=theta)(ancilla[1])
-
-        # Free
-        context.qubit_manager.qfree([*ancilla])
-
-
-@attr.frozen
-class UniformPrepareUpperTriangular(infra.GateWithRegisters):
-    r"""Prepares a uniform superposition over first $n * (n + 1) / 2$ basis states.
-
-    Prepares the state
-
-    $$
-        |\psi\rangle = \frace{1}{\sqrt{n*(n+1)/2}} \sum_{p \le q} |p q\rangle
-    $$
-
-    Args:
-        n: The gate prepares a uniform superposition over two $n$ basis state registers.
-
-        cv: Control values for each control qubit. If specified, a controlled version
-            of the gate is constructed.
-
-    References:
-        See Fig 12 of https://arxiv.org/abs/1805.03662 for more details.
-    """
-
-    n: int
-
-    @cached_property
-    def registers(self) -> infra.Registers:
-        return infra.Registers.build(p=(self.n - 1).bit_length(), q=(self.n - 1).bit_length())
-
-    def __repr__(self) -> str:
-        return f"cirq_ft.PrepareUniformSuperpositionTwoRegisters({self.n}, {self.n})"
-
-    def _circuit_diagram_info_(self, args: cirq.CircuitDiagramInfoArgs) -> cirq.CircuitDiagramInfo:
-        target_symbols = ['p'] * self.registers['p'].total_bits()
-        target_symbols += ['q'] * self.registers['q'].total_bits()
-        target_symbols[0] = f"UNIFORM({self.n})"
-        return cirq.CircuitDiagramInfo(wire_symbols=target_symbols)
-
-    def decompose_from_registers(
-        self,
-        *,
-        context: cirq.DecompositionContext,
-        **quregs: NDArray[cirq.Qid],  # type:ignore[type-var]
-    ) -> cirq.OP_TREE:
-        p, q = quregs['p'], quregs['q']
-        # Find K and L as per https://arxiv.org/abs/1805.03662 Fig 12.
-        yield cirq.H.on_each(*p)
-        yield cirq.H.on_each(*q)
-        # Alloc
-        ancilla = context.qubit_manager.qalloc(3)
-
-        l = self.n * (self.n + 1) // 2
-        theta = np.arccos(1 - (2 ** np.floor(np.log2(l))) / l)
-        logn = (self.n - 1).bit_length()
-        yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[0])
-        yield cirq.Ry(rads=theta)(ancilla[1])
-        yield cirq.CCX(ancilla[0], ancilla[1], ancilla[2])
-        yield cirq.Z(ancilla[2])
-        yield cirq.CCX(ancilla[0], ancilla[1], ancilla[2])
-        yield cirq.Ry(rads=-theta)(ancilla[1])
-        yield arithmetic_gates.LessThanEqualGate(logn, logn).on(*p, *q, ancilla[0])
-
-        # Second half of circuit
-        yield cirq.H.on_each(*p)
-        yield cirq.H.on_each(*q)
-        control_pqa = self.registers.merge_qubits(**quregs) + ancilla[:2]
-        yield MultiControlPauli(cvs=[0] * (2 * logn + 2), target_gate=cirq.X).on_registers(
-            controls=control_pqa, target=ancilla[2]
-        )
-        yield cirq.Z(ancilla[2])
-        yield MultiControlPauli(cvs=[0] * (2 * logn + 2), target_gate=cirq.X).on_registers(
-            controls=control_pqa, target=ancilla[2]
-        )
-        yield cirq.H.on_each(*p)
-        yield cirq.H.on_each(*q)
-        # Reflect around ancilla
-        yield cirq.Ry(rads=theta)(ancilla[1])
-
-        # Free
-        context.qubit_manager.qfree([*ancilla])
-
-
-@attr.frozen
-class PrepareUpperTriangular(infra.GateWithRegisters):
-    r"""Prepares a uniform superposition over first $n * (n + 1) / 2$ basis states."""
-
-    dim: int
-    alt_p: Tuple[int, ...]
-    alt_q: Tuple[int, ...]
-    keep: Tuple[int, ...]
-    mu: int
-
-    @classmethod
-    def build(
-        cls, dim: int, probs: NDArray[np.int_], epsilon: float = 1e-8
-    ) -> "PrepareUpperTriangular":
-        alt, keep, mu = linalg.preprocess_lcu_coefficients_for_reversible_sampling(
-            lcu_coefficients=probs, epsilon=epsilon
-        )
-        p_upper, q_upper = np.triu_indices(dim)
-        # Get the correct p and q indices from their corresponding upper triangular values.
-        alt_p = tuple([int(p) for p in p_upper[alt]])
-        alt_q = tuple([int(q) for q in q_upper[alt]])
-        keep = tuple([int(k) for k in keep])
-        return PrepareUpperTriangular(dim=dim, alt_p=alt_p, alt_q=alt_q, keep=keep, mu=mu)
-
-    @cached_property
-    def registers(self) -> infra.Registers:
-        return infra.Registers.build(p=(self.dim - 1).bit_length(), q=(self.dim - 1).bit_length())
-
-    def __repr__(self) -> str:
-        return f"cirq_ft.PrepareUpperTriangular({self.n}, {self.n})"
-
-    def _circuit_diagram_info_(self, args: cirq.CircuitDiagramInfoArgs) -> cirq.CircuitDiagramInfo:
-        target_symbols = ['p'] * self.registers['p'].total_bits()
-        target_symbols += ['q'] * self.registers['q'].total_bits()
-        target_symbols[0] = f"Prepare({self.dim})"
-        return cirq.CircuitDiagramInfo(wire_symbols=target_symbols)
-
-    def decompose_from_registers(
-        self,
-        *,
-        context: cirq.DecompositionContext,
-        **quregs: NDArray[cirq.Qid],  # type:ignore[type-var]
-    ) -> cirq.OP_TREE:
-        p, q = quregs['p'], quregs['q']
-        # Allocate ancillas
-        log_dim = (self.dim - 1).bit_length()
-        contiguous_reg_anc = context.qubit_manager.qalloc(2 * log_dim)
-        alt_p_anc = context.qubit_manager.qalloc(log_dim)
-        alt_q_anc = context.qubit_manager.qalloc(log_dim)
-        keep_anc = context.qubit_manager.qalloc(self.mu)
-        sigma_anc = context.qubit_manager.qalloc(self.mu)
-        ineq_anc = context.qubit_manager.qalloc(1)
-        # 0. Prepare uniform superposition
-        yield UniformPrepareUpperTriangular(n=self.dim).on_registers(p=p, q=q)
-        # 1. Contiguous register gate from p and q
-        yield ContiguousRegisterGateEqual(log_dim, 2 * log_dim).on(*p, *q, *contiguous_reg_anc)
-        # 2. SelectSwapQROM for alt / keep
-        yield SelectSwapQROM(
-            self.alt_p, self.alt_q, self.keep, target_bitsizes=(log_dim, log_dim, self.mu)
-        ).on_registers(
-            selection=contiguous_reg_anc, target0=alt_p_anc, target1=alt_q_anc, target2=keep_anc
-        )
-        yield cirq.H.on_each(*sigma_anc)
-        # # 3. Inequality test
-        yield arithmetic_gates.LessThanEqualGate(self.mu, self.mu).on(
-            *keep_anc, *sigma_anc, *ineq_anc
-        )
-        # # 4. Swaps
-        yield swap_network.MultiTargetCSwap.make_on(
-            control=ineq_anc, target_x=alt_p_anc, target_y=p
-        )
-        yield swap_network.MultiTargetCSwap.make_on(
-            control=ineq_anc, target_x=alt_q_anc, target_y=q
-        )
-        # # Uncompute inequality
-        yield arithmetic_gates.LessThanEqualGate(self.mu, self.mu).on(
-            *keep_anc, *sigma_anc, *ineq_anc
-        )
-
-        # Free
-        context.qubit_manager.qfree(
-            [*contiguous_reg_anc, *alt_p_anc, *alt_q_anc, *keep_anc, *sigma_anc, *ineq_anc]
-        )