From e500821945e47ab9bc7905de29df3f0a96509a46 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 21 Aug 2023 16:49:53 -0400
Subject: [PATCH 01/91] Remove deprecated jax.experimental.loops and jax.ops.index_add

---
 quax/external_integrals/tmp_potential.py | 245 ++++++------
 quax/integrals/integrals_utils.py        |  23 +-
 quax/integrals/oei.py                    | 253 ++++++------
 quax/integrals/tei.py                    | 466 ++++++++++++-----------
 quax/methods/ccsd.py                     |   1 -
 quax/methods/ccsd_t.py                   |  95 +++--
 quax/methods/energy_utils.py             |   1 -
 quax/methods/mp2.py                      |  16 +-
 setup.py                                 |   5 +-
 9 files changed, 611 insertions(+), 494 deletions(-)

diff --git a/quax/external_integrals/tmp_potential.py b/quax/external_integrals/tmp_potential.py
index dee8984..29fff39 100644
--- a/quax/external_integrals/tmp_potential.py
+++ b/quax/external_integrals/tmp_potential.py
@@ -1,37 +1,44 @@
 # Temporary potential integrals since libint does allow beyond 2nd order at the moment.
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
-import jax.numpy as np
-import numpy as onp
-from jax.experimental import loops
+import jax.numpy as jnp
+from jax.lax import fori_loop, while_loop
 
 from ..integrals.integrals_utils import boys, binomial_prefactor, gaussian_product, boys, factorials, double_factorials, neg_one_pow, cartesian_product, am_leading_indices, angular_momentum_combinations
 from ..integrals.basis_utils import flatten_basis_data, get_nbf
 
 def A_array(l1,l2,PA,PB,CP,g,A_vals):
-    with loops.Scope() as s:
-      # Hard code only up to f functions (fxxx | fxxx) => l1 + l2 + 1 = 7
-      s.A = A_vals
-      s.i = 0
-      s.r = 0
-      s.u = 0 
-
-      s.i = l1 + l2  
-      for _ in s.while_range(lambda: s.i > -1):   
-        Aterm = neg_one_pow[s.i] * binomial_prefactor(s.i,l1,l2,PA,PB) * factorials[s.i]
-        s.r = s.i // 2
-        for _ in s.while_range(lambda: s.r > -1):
-          s.u = (s.i - 2 * s.r) // 2 
-          for _ in s.while_range(lambda: s.u > -1):
-            I = s.i - 2 * s.r - s.u 
-            tmp = I - s.u
-            fact_ratio = 1 / (factorials[s.r] * factorials[s.u] * factorials[tmp])
-            Aterm *= neg_one_pow[s.u]  * CP[tmp] * (0.25 / g)**(s.r+s.u) * fact_ratio 
-            s.A = jax.ops.index_add(s.A, I, Aterm)
-            s.u -= 1
-          s.r -= 1
-        s.i -= 1
-      return s.A
+    """Accumulate A-term contributions into A_vals via nested lax.while_loop passes over i, r and u."""
+    def loop_i(arr0):
+       i, r, u, A = arr0
+       Aterm = neg_one_pow[i] * binomial_prefactor(i,l1,l2,PA,PB) * factorials[i]
+       r = i // 2
+
+       def loop_r(arr1):
+          i, r, u, Aterm, A = arr1
+          u = (i - 2 * r) // 2
+
+          def loop_u(arr2):
+             i, r, u, Aterm, A = arr2
+             I = i - 2 * r - u
+             tmp = I - u
+             fact_ratio = 1 / (factorials[r] * factorials[u] * factorials[tmp])
+             Aterm *= neg_one_pow[u]  * CP[tmp] * (0.25 / g)**(r+u) * fact_ratio
+             A = A.at[I].add(Aterm)  # accumulate the term into slot I rather than overwrite it
+             u -= 1
+             return (i, r, u, Aterm, A)
+          # Carry layout is (i, r, u, Aterm, A): the u counter lives at index 2.
+          i_, r_, u_, Aterm_, A_ = while_loop(lambda arr2: arr2[2] > -1, loop_u, (i, r, u, Aterm, A))
+          r_ -= 1
+          return (i_, r_, u_, Aterm_, A_)
+
+       i_, r_, u_, Aterm_, A_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i, r, u, Aterm, A))
+       i_ -= 1
+       return (i_, r_, u_, A_)
+
+    i_, r_, u_, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
+
+    return A
 
 @jax.jit
 def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals):
@@ -39,31 +46,42 @@ def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefacto
     Computes a single electron-nuclear attraction integral primitive
     """
     gamma = aa + bb
-    prefactor *= -2 * np.pi / gamma
-
-    with loops.Scope() as s:
-      s.val = 0.
-      for i in s.range(Pgeom_pow.shape[0]):
-        Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[i,0,:],gamma,A_vals)
-        Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[i,1,:],gamma,A_vals)
-        Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[i,2,:],gamma,A_vals)
-
-        with loops.Scope() as S:
-          S.total = 0.
-          S.I = 0
-          S.J = 0
-          S.K = 0
-          for _ in S.while_range(lambda: S.I < la + lb + 1):
-            S.J = 0 
-            for _ in S.while_range(lambda: S.J < ma + mb + 1):
-              S.K = 0 
-              for _ in S.while_range(lambda: S.K < na + nb + 1):
-                S.total += Ax[S.I] * Ay[S.J] * Az[S.K] * boys_eval[S.I + S.J + S.K, i]
-                S.K += 1
-              S.J += 1
-            S.I += 1
-        s.val += charges[i] * prefactor * S.total
-      return s.val
+    prefactor *= -2 * jnp.pi / gamma
+    # loop_val adds the contribution of nucleus n to the running sum val.
+    def loop_val(n, val):
+      Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[n,0,:],gamma,A_vals)
+      Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[n,1,:],gamma,A_vals)
+      Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[n,2,:],gamma,A_vals)
+      # total starts as a float so the while_loop carry keeps a single dtype.
+      I, J, K, total = 0, 0, 0, 0.
+      def loop_I(arr0):
+         I, J, K, val, total = arr0
+         J = 0
+
+         def loop_J(arr1):
+            I, J, K, val, total = arr1
+            K = 0
+
+            def loop_K(arr2):
+               I, J, K, val, total = arr2
+               total += Ax[I] * Ay[J] * Az[K] * boys_eval[I + J + K, n]
+               K += 1
+               return (I, J, K, val, total)
+
+            I_, J_, K_, val_, total_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I, J, K, val, total))
+            J_ += 1
+            return (I_, J_, K_, val_, total_)
+         # The J-level loop must run loop_J; passing loop_I here would recurse while tracing.
+         I_, J_, K_, val_, total_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I, J, K, val, total))
+         I_ += 1
+         return (I_, J_, K_, val_, total_)
+
+      I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
+      val_ += charges[n] * prefactor * total_
+      return val_
+
+    val = fori_loop(0, Pgeom_pow.shape[0], loop_val, 0.)
+    return val
 
 def tmp_potential(geom, basis, charges):
     """
@@ -72,66 +90,71 @@ def tmp_potential(geom, basis, charges):
     coeffs, exps, atoms, ams, indices, dims = flatten_basis_data(basis)
     nbf = get_nbf(basis)
     nprim = coeffs.shape[0]
-    max_am = np.max(ams)
-    A_vals = np.zeros(2*max_am+1)
+    max_am = jnp.max(ams)  # largest entry of ams; sizes the power and Boys tables built below
+    A_vals = jnp.zeros(2*max_am+1)  # zero buffer handed to potential() as the starting A array
 
     # Save various AM distributions for indexing
     # Obtain all possible primitive duet index combinations 
-    primitive_duets = cartesian_product(np.arange(nprim), np.arange(nprim))
-
-    with loops.Scope() as s:
-      s.V = np.zeros((nbf,nbf))
-      s.a = 0  # center A angular momentum iterator 
-      s.b = 0  # center B angular momentum iterator 
-
-      for prim_duet in s.range(primitive_duets.shape[0]):
-        p1,p2 = primitive_duets[prim_duet]
-        coef = coeffs[p1] * coeffs[p2]
-        aa, bb = exps[p1], exps[p2]
-        atom1, atom2 = atoms[p1], atoms[p2]
-        am1, am2 = ams[p1], ams[p2]
-        A, B = geom[atom1], geom[atom2]
-        ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
-
-        gamma = aa + bb
-        prefactor = np.exp(-aa * bb * np.dot(A-B,A-B) / gamma)
-        P = (aa * A + bb * B) / gamma
-        # Maximum angular momentum: hard coded
-        # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi. 
-        # We need 2+max_am since kinetic requires incrementing angluar momentum by +2
-        PA_pow = np.power(np.broadcast_to(P-A, (max_am+3,3)).T, np.arange(max_am+3))
-        PB_pow = np.power(np.broadcast_to(P-B, (max_am+3,3)).T, np.arange(max_am+3))
-
-        # For potential integrals, we need the difference between 
-        # the gaussian product center P and ALL atoms in the molecule, 
-        # and then take all possible powers up to 2*max_am. 
-        # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
-        # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
-        P_minus_geom = np.broadcast_to(P, geom.shape) - geom
-        Pgeom_pow = np.power(np.transpose(np.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), np.arange(2*max_am + 1))
-        # All possible np.dot(P-atom,P-atom) 
-        rcp2 = np.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
-        # All needed (and unneeded, for am < max_am) boys function evaluations
-        boys_arg = np.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
-        boys_nu = np.tile(np.arange(2*max_am+1), (geom.shape[0],1)).T
-        boys_eval = boys(boys_nu,boys_arg)
-
-        s.a = 0
-        for _ in s.while_range(lambda: s.a < dims[p1]):
-          s.b = 0
-          for _ in s.while_range(lambda: s.b < dims[p2]):
-            # Gather angular momentum and index
-            la,ma,na = angular_momentum_combinations[s.a + ld1]
-            lb,mb,nb = angular_momentum_combinations[s.b + ld2]
-            # To only create unique indices, need to have separate indices arrays for i and j.
-            i = indices[p1] + s.a
-            j = indices[p2] + s.b
-            # Compute one electron integrals and add to appropriate index
-            potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-            s.V = jax.ops.index_add(s.V, jax.ops.index[i,j],  potential_int)
-
-            s.b += 1
-          s.a += 1
-      return s.V
-
-
+    primitive_duets = cartesian_product(jnp.arange(nprim), jnp.arange(nprim))
+    V = jnp.zeros((nbf,nbf))
+
+    for n in range(primitive_duets.shape[0]):
+       p1,p2 = primitive_duets[n]
+       coef = coeffs[p1] * coeffs[p2]
+       aa, bb = exps[p1], exps[p2]
+       atom1, atom2 = atoms[p1], atoms[p2]
+       am1, am2 = ams[p1], ams[p2]
+       A, B = geom[atom1], geom[atom2]
+       ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
+
+       gamma = aa + bb
+       prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
+       P = (aa * A + bb * B) / gamma
+       # Maximum angular momentum: hard coded
+       # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi.
+       # We need 2+max_am since kinetic requires incrementing angular momentum by +2
+       PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+3,3)).T, jnp.arange(max_am+3))
+       PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+3,3)).T, jnp.arange(max_am+3))
+
+       # For potential integrals, we need the difference between
+       # the gaussian product center P and ALL atoms in the molecule,
+       # and then take all possible powers up to 2*max_am.
+       # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
+       # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
+       P_minus_geom = jnp.broadcast_to(P, geom.shape) - geom
+       Pgeom_pow = jnp.power(jnp.transpose(jnp.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), jnp.arange(2*max_am + 1))
+       # All possible jnp.dot(P-atom,P-atom)
+       rcp2 = jnp.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
+       # All needed (and unneeded, for am < max_am) boys function evaluations
+       boys_arg = jnp.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
+       boys_nu = jnp.tile(jnp.arange(2*max_am+1), (geom.shape[0],1)).T
+       boys_eval = boys(boys_nu,boys_arg)
+
+       a, b = 0, 0
+       def loop_a(arr0):
+          a, b, oei = arr0
+          b = 0
+
+          def loop_b(arr1):
+             a, b, oei = arr1
+             # Gather angular momentum and index
+             la,ma,na = angular_momentum_combinations[a + ld1]
+             lb,mb,nb = angular_momentum_combinations[b + ld2]
+             # To only create unique indices, need to have separate indices arrays for i and j.
+             i = indices[p1] + a
+             j = indices[p2] + b
+             # Compute one electron integrals and add to appropriate index
+             potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
+             oei = oei.at[i,j].add(potential_int)
+             b += 1
+             return (a, b, oei)
+
+          a_, b_, oei_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a, b, oei))
+          a_ += 1
+          return (a_, b_, oei_)
+
+       a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, V))
+       # Keep the updated matrix so the next primitive pair accumulates on top of it.
+       V = oei_
+
+    return V
diff --git a/quax/integrals/integrals_utils.py b/quax/integrals/integrals_utils.py
index 8239298..fd0545c 100644
--- a/quax/integrals/integrals_utils.py
+++ b/quax/integrals/integrals_utils.py
@@ -3,7 +3,7 @@
 import jax.numpy as jnp
 import numpy as np
 from functools import partial
-from jax.experimental import loops
+from jax.lax import while_loop
 
 def boys(m,x,eps=1e-12):
     return 0.5 * (x + eps)**(-(m + 0.5)) * jax.lax.igamma(m + 0.5, x + eps) \
@@ -19,15 +19,16 @@ def binomial_prefactor(k, l1, l2, PAx, PBx):
     """
     q = jax.lax.max(-k, k-2*l2)
     q_final = jax.lax.min(k, 2*l1-k)
-    with loops.Scope() as L:
-      L.total = 0.
-      L.q = q
-      for _ in L.while_range(lambda: L.q <= q_final):
-        i = (k+L.q)//2
-        j = (k-L.q)//2
-        L.total += PAx[l1-i] * PBx[l2-j] * binomials[l1,i] * binomials[l2,j]
-        L.q += 2
-    return L.total
+    # q advances in steps of 2 from its start value up to and including q_final.
+    def loop_q(arr):
+       q_n, total = arr
+       i = (k+q_n)//2
+       j = (k-q_n)//2
+       return (q_n+2, total + PAx[l1-i] * PBx[l2-j] * binomials[l1,i] * binomials[l2,j])
+
+    q_, total_sum = while_loop(lambda arr: arr[0] <= q_final, loop_q, (q, 0.))
+
+    return total_sum
 
 def gaussian_product(alpha1,A,alpha2,B):
     '''Gaussian product theorem. Returns center.'''
@@ -36,7 +37,7 @@ def gaussian_product(alpha1,A,alpha2,B):
 def find_unique_shells(nshells):
     '''Find shell quartets which correspond to corresponding to unique two-electron integrals, i>=j, k>=l, IJ>=KL'''
     v = np.arange(nshells,dtype=jnp.int16) 
-    indices = old_cartesian_product(v,v,v,v)
+    indices = cartesian_product(v,v,v,v)
     cond1 = (indices[:,0] >= indices[:,1]) & (indices[:,2] >= indices[:,3]) 
     cond2 = indices[:,0] * (indices[:,0] + 1)/2 + indices[:,1] >= indices[:,2] * (indices[:,2] + 1)/2 + indices[:,3]
     mask = cond1 & cond2 
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index ac6bf69..4327b9f 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -1,7 +1,8 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.experimental import loops
+import numpy as np
+from jax.lax import fori_loop, while_loop
 from functools import partial
 
 from .integrals_utils import gaussian_product, boys, binomial_prefactor, factorials, double_factorials, neg_one_pow, cartesian_product, am_leading_indices, angular_momentum_combinations
@@ -34,13 +35,14 @@ def overlap_component(l1,l2,PAx,PBx,gamma):
     The 1d overlap integral component. Taketa, Huzinaga, Oohata 2.12
     """
     K = 1 + (l1 + l2) // 2  
-    with loops.Scope() as s:
-      s.total = 0.
-      s.i = 0
-      for _ in s.while_range(lambda: s.i < K):
-        s.total += binomial_prefactor(2*s.i,l1,l2,PAx,PBx) * double_factorials[2*s.i-1] / (2*gamma)**s.i
-        s.i += 1
-      return s.total
+    # Sum K terms with lax.while_loop; the carry is (i, total) and total starts as a float.
+    def loop_i(arr):
+       i, total = arr
+       return (i+1, total + binomial_prefactor(2*i,l1,l2,PAx,PBx) * double_factorials[2*i-1] / (2*gamma)**i)
+
+    i_accu, total_sum = while_loop(lambda arr: arr[0] < K, loop_i, (0, 0.)) # (i, total)
+
+    return total_sum
 
 def kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor):
     """
@@ -68,29 +70,37 @@ def kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor):
     return prefactor * (term1 + term2 + term3)
 
 def A_array(l1,l2,PA,PB,CP,g,A_vals):
-    with loops.Scope() as s:
-      # Hard code only up to f functions (fxxx | fxxx) => l1 + l2 + 1 = 7
-      s.A = A_vals
-      s.i = 0
-      s.r = 0
-      s.u = 0 
-
-      s.i = l1 + l2  
-      for _ in s.while_range(lambda: s.i > -1):   
-        Aterm = neg_one_pow[s.i] * binomial_prefactor(s.i,l1,l2,PA,PB) * factorials[s.i]
-        s.r = s.i // 2
-        for _ in s.while_range(lambda: s.r > -1):
-          s.u = (s.i - 2 * s.r) // 2 
-          for _ in s.while_range(lambda: s.u > -1):
-            I = s.i - 2 * s.r - s.u 
-            tmp = I - s.u
-            fact_ratio = 1 / (factorials[s.r] * factorials[s.u] * factorials[tmp])
-            Aterm *= neg_one_pow[s.u]  * CP[tmp] * (0.25 / g)**(s.r+s.u) * fact_ratio 
-            s.A = jax.ops.index_add(s.A, I, Aterm)
-            s.u -= 1
-          s.r -= 1
-        s.i -= 1
-      return s.A
+    """Accumulate A-term contributions into A_vals via nested lax.while_loop passes over i, r and u."""
+    def loop_i(arr0):
+       i, r, u, A = arr0
+       Aterm = neg_one_pow[i] * binomial_prefactor(i,l1,l2,PA,PB) * factorials[i]
+       r = i // 2
+
+       def loop_r(arr1):
+          i, r, u, Aterm, A = arr1
+          u = (i - 2 * r) // 2
+
+          def loop_u(arr2):
+             i, r, u, Aterm, A = arr2
+             I = i - 2 * r - u
+             tmp = I - u
+             fact_ratio = 1 / (factorials[r] * factorials[u] * factorials[tmp])
+             Aterm *= neg_one_pow[u]  * CP[tmp] * (0.25 / g)**(r+u) * fact_ratio
+             A = A.at[I].add(Aterm)  # accumulate the term into slot I rather than overwrite it
+             u -= 1
+             return (i, r, u, Aterm, A)
+          # Carry layout is (i, r, u, Aterm, A): the u counter lives at index 2.
+          i_, r_, u_, Aterm_, A_ = while_loop(lambda arr2: arr2[2] > -1, loop_u, (i, r, u, Aterm, A))
+          r_ -= 1
+          return (i_, r_, u_, Aterm_, A_)
+
+       i_, r_, u_, Aterm_, A_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i, r, u, Aterm, A))
+       i_ -= 1
+       return (i_, r_, u_, A_)
+
+    i_, r_, u_, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
+
+    return A
 
 def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals):
     """
@@ -99,29 +109,40 @@ def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefacto
     gamma = aa + bb
     prefactor *= -2 * jnp.pi / gamma
 
-    with loops.Scope() as s:
-      s.val = 0.
-      for i in s.range(Pgeom_pow.shape[0]):
-        Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[i,0,:],gamma,A_vals)
-        Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[i,1,:],gamma,A_vals)
-        Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[i,2,:],gamma,A_vals)
-
-        with loops.Scope() as S:
-          S.total = 0.
-          S.I = 0
-          S.J = 0
-          S.K = 0
-          for _ in S.while_range(lambda: S.I < la + lb + 1):
-            S.J = 0 
-            for _ in S.while_range(lambda: S.J < ma + mb + 1):
-              S.K = 0 
-              for _ in S.while_range(lambda: S.K < na + nb + 1):
-                S.total += Ax[S.I] * Ay[S.J] * Az[S.K] * boys_eval[S.I + S.J + S.K, i]
-                S.K += 1
-              S.J += 1
-            S.I += 1
-        s.val += charges[i] * prefactor * S.total
-      return s.val
+    def loop_val(n, val):
+      Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[n,0,:],gamma,A_vals)
+      Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[n,1,:],gamma,A_vals)
+      Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[n,2,:],gamma,A_vals)
+      # total starts as a float so the while_loop carry keeps a single dtype.
+      I, J, K, total = 0, 0, 0, 0.
+      def loop_I(arr0):
+         I, J, K, val, total = arr0
+         J = 0
+
+         def loop_J(arr1):
+            I, J, K, val, total = arr1
+            K = 0
+
+            def loop_K(arr2):
+               I, J, K, val, total = arr2
+               total += Ax[I] * Ay[J] * Az[K] * boys_eval[I + J + K, n]
+               K += 1
+               return (I, J, K, val, total)
+
+            I_, J_, K_, val_, total_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I, J, K, val, total))
+            J_ += 1
+            return (I_, J_, K_, val_, total_)
+
+         I_, J_, K_, val_, total_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I, J, K, val, total))
+         I_ += 1
+         return (I_, J_, K_, val_, total_)
+
+      I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
+      val_ += charges[n] * prefactor * total_
+      return val_
+
+    val = fori_loop(0, Pgeom_pow.shape[0], loop_val, 0.)
+    return val
 
 def oei_arrays(geom, basis, charges):
     """
@@ -136,63 +157,69 @@ def oei_arrays(geom, basis, charges):
     # Save various AM distributions for indexing
     # Obtain all possible primitive quartet index combinations 
     primitive_duets = cartesian_product(jnp.arange(nprim), jnp.arange(nprim))
-
-    with loops.Scope() as s:
-      s.oei = jnp.zeros((3,nbf,nbf))
-      s.a = 0  # center A angular momentum iterator 
-      s.b = 0  # center B angular momentum iterator 
-
-      for prim_duet in s.range(primitive_duets.shape[0]):
-        p1,p2 = primitive_duets[prim_duet]
-        coef = coeffs[p1] * coeffs[p2]
-        aa, bb = exps[p1], exps[p2]
-        atom1, atom2 = atoms[p1], atoms[p2]
-        am1, am2 = ams[p1], ams[p2]
-        A, B = geom[atom1], geom[atom2]
-        ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
-
-        gamma = aa + bb
-        prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
-        P = (aa * A + bb * B) / gamma
-        # Maximum angular momentum: hard coded
-        #max_am = 3 # f function support
-        # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi. 
-        # We need 2+max_am since kinetic requires incrementing angluar momentum by +2
-        PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+3,3)).T, jnp.arange(max_am+3))
-        PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+3,3)).T, jnp.arange(max_am+3))
-
-        # For potential integrals, we need the difference between 
-        # the gaussian product center P and ALL atoms in the molecule, 
-        # and then take all possible powers up to 2*max_am. 
-        # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
-        # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
-        P_minus_geom = jnp.broadcast_to(P, geom.shape) - geom
-        Pgeom_pow = jnp.power(jnp.transpose(jnp.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), jnp.arange(2*max_am + 1))
-        # All possible jnp.dot(P-atom,P-atom) 
-        rcp2 = jnp.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
-        # All needed (and unneeded, for am < max_am) boys function evaluations
-        boys_arg = jnp.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
-        boys_nu = jnp.tile(jnp.arange(2*max_am+1), (geom.shape[0],1)).T
-        boys_eval = boys(boys_nu,boys_arg)
-
-        s.a = 0
-        for _ in s.while_range(lambda: s.a < dims[p1]):
-          s.b = 0
-          for _ in s.while_range(lambda: s.b < dims[p2]):
-            # Gather angular momentum and index
-            la,ma,na = angular_momentum_combinations[s.a + ld1]
-            lb,mb,nb = angular_momentum_combinations[s.b + ld2]
-            # To only create unique indices, need to have separate indices arrays for i and j.
-            i = indices[p1] + s.a
-            j = indices[p2] + s.b
-            # Compute one electron integrals and add to appropriate index
-            overlap_int = overlap(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
-            kinetic_int = kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
-            potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-            s.oei = jax.ops.index_add(s.oei, ([0,1,2],[i,i,i],[j,j,j]), (overlap_int, kinetic_int, potential_int))
-
-            s.b += 1
-          s.a += 1
-    S, T, V = s.oei[0], s.oei[1], s.oei[2]
-    return S, T, V
+    STV = jnp.zeros((3,nbf,nbf))
+
+    for n in range(primitive_duets.shape[0]):
+       p1,p2 = primitive_duets[n]
+       coef = coeffs[p1] * coeffs[p2]
+       aa, bb = exps[p1], exps[p2]
+       atom1, atom2 = atoms[p1], atoms[p2]
+       am1, am2 = ams[p1], ams[p2]
+       A, B = geom[atom1], geom[atom2]
+       ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
+
+       gamma = aa + bb
+       prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
+       P = (aa * A + bb * B) / gamma
+       # Maximum angular momentum: hard coded
+       #max_am = 3 # f function support
+       # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi.
+       # We need 2+max_am since kinetic requires incrementing angular momentum by +2
+       PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+3, 3)).T, jnp.arange(max_am+3))
+       PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+3, 3)).T, jnp.arange(max_am+3))
+
+       # For potential integrals, we need the difference between
+       # the gaussian product center P and ALL atoms in the molecule,
+       # and then take all possible powers up to 2*max_am.
+       # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
+       # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
+       P_minus_geom = jnp.broadcast_to(P, geom.shape) - geom
+       Pgeom_pow = jnp.power(jnp.transpose(jnp.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), jnp.arange(2*max_am + 1))
+       # All possible jnp.dot(P-atom,P-atom)
+       rcp2 = jnp.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
+       # All needed (and unneeded, for am < max_am) boys function evaluations
+       boys_arg = jnp.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
+       boys_nu = jnp.tile(jnp.arange(2*max_am+1), (geom.shape[0],1)).T
+       boys_eval = boys(boys_nu,boys_arg)
+
+       a, b = 0, 0
+       def loop_a(arr0):
+          a, b, oei = arr0
+          b = 0
+
+          def loop_b(arr1):
+             a, b, oei = arr1
+             # Gather angular momentum and index
+             la,ma,na = angular_momentum_combinations[a + ld1]
+             lb,mb,nb = angular_momentum_combinations[b + ld2]
+             # To only create unique indices, need to have separate indices arrays for i and j.
+             i = indices[p1] + a
+             j = indices[p2] + b
+             # Compute one electron integrals and add to appropriate index
+             overlap_int = overlap(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
+             kinetic_int = kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
+             potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
+             oei = oei.at[([0,1,2],[i,i,i],[j,j,j])].add((overlap_int, kinetic_int, potential_int))
+             b += 1
+             return (a, b, oei)
+
+          a_, b_, oei_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a, b, oei))
+          a_ += 1
+          return (a_, b_, oei_)
+
+       a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, STV))
+       # Keep the updated stack so the next primitive pair accumulates on top of it.
+       STV = oei_
+
+    return STV[0], STV[1], STV[2]
 
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 8658817..4ddcb5d 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -2,7 +2,7 @@
 from jax.config import config
 config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.experimental import loops
+from jax.lax import fori_loop, while_loop
 
 from .basis_utils import flatten_basis_data, get_nbf
 from .integrals_utils import gaussian_product, boys, binomial_prefactor, cartesian_product, am_leading_indices, angular_momentum_combinations, fact_ratio2, neg_one_pow
@@ -12,100 +12,118 @@ def B_array(l1,l2,l3,l4,pa_pow,pb_pow,qc_pow,qd_pow,qp_pow,g1_pow,g2_pow,oodelta
     # Can you split it into two Scopes?
     # Can you convert  all or part of this to a tensor contraction?  
     # It does not appear to help to pull out binomial prefactors and compute outside loop.
-    with loops.Scope() as s:
-      s.B = B_vals
-      s.i2 = 0
-      s.r1 = 0
-      s.r2 = 0
-      s.u = 0 
-      s.i1 = l1 + l2  
-      for _ in s.while_range(lambda: s.i1 > -1):
-        Bterm = binomial_prefactor(s.i1,l1,l2,pa_pow,pb_pow) 
-        tmp = s.i1
-        s.r1 = s.i1 // 2
-        for _ in s.while_range(lambda: s.r1 > -1):
-          Bterm *= fact_ratio2[s.i1,s.r1]
-          Bterm *= g1_pow[s.r1-s.i1]
-          tmp -= 2 * s.r1
-          s.i2 = l3 + l4 
-          for _ in s.while_range(lambda: s.i2 > -1):
-            Bterm *= neg_one_pow[s.i2]
-            Bterm *= binomial_prefactor(s.i2,l3,l4,qc_pow,qd_pow) 
-            tmp += s.i2
-            s.r2 = s.i2 // 2
-            for _ in s.while_range(lambda: s.r2 > -1):
-              Bterm *= fact_ratio2[s.i2,s.r2]
-              Bterm *= g2_pow[s.r2-s.i2]
-              tmp -= 2 * s.r2
-              s.u = tmp // 2
-              for _ in s.while_range(lambda: s.u > -1):
-                I = tmp - s.u 
-                Bterm *= neg_one_pow[s.u] 
-                Bterm *= fact_ratio2[tmp,s.u]
-                Bterm *= qp_pow[tmp - 2 * s.u]
-                Bterm *= oodelta_pow[I]
-                s.B = jax.ops.index_add(s.B, I, Bterm)
-                s.u -= 1
-              s.r2 -= 1
-            s.i2 -= 1
-          s.r1 -= 1
-        s.i1 -= 1
-      return s.B
-
-def primitive_tei(La,Lb,Lc,Ld, A, B, C, D, aa, bb, cc, dd, c1, c2, c3, c4): 
-    """
-    TODO can define a jvp rule for this, have it increment arguments appropriately
-    Computes a single contracted two electron integral. 
-    given angular momentum vectors, centers, and single value exponents and contraction coefficients
-    """
-    # NOTE THIS FUNCTION IS NOT USED. 
-    # For debugging. This is implementation is directly coded into tei_array 
-    # in order to save some intermediates.
-    la, ma, na = La
-    lb, mb, nb = Lb
-    lc, mc, nc = Lc
-    ld, md, nd = Ld
-    xa,ya,za = A 
-    xb,yb,zb = B 
-    xc,yc,zc = C 
-    xd,yd,zd = D 
-
-    rab2 = jnp.dot(A-B,A-B)
-    rcd2 = jnp.dot(C-D,C-D)
-    coef = c1 * c2 * c3 * c4
-    xyzp = gaussian_product(aa,A,bb,B)
-    xyzq = gaussian_product(cc,C,dd,D)
-    xp,yp,zp = xyzp
-    xq,yq,zq = xyzq
-    rpq2 = jnp.dot(xyzp-xyzq,xyzp-xyzq)
-    gamma1 = aa + bb
-    gamma2 = cc + dd
-    delta = 0.25*(1/gamma1+1/gamma2)
-    Bx = B_array(la,lb,lc,ld,xp,xa,xb,xq,xc,xd,gamma1,gamma2,delta)
-    By = B_array(ma,mb,mc,md,yp,ya,yb,yq,yc,yd,gamma1,gamma2,delta)
-    Bz = B_array(na,nb,nc,nd,zp,za,zb,zq,zc,zd,gamma1,gamma2,delta)
-    boys_arg = 0.25*rpq2/delta
-    boys_eval = boys(jnp.arange(13), boys_arg) # supports up to f functions
-
-    with loops.Scope() as s:
-      s.I = 0
-      s.J = 0  
-      s.K = 0 
-      s.primitive = 0.
-      s.I = 0 
-      for _ in s.while_range(lambda: s.I < la + lb + lc + ld + 1):
-        s.J = 0 
-        for _ in s.while_range(lambda: s.J < ma + mb + mc + md + 1):
-          s.K = 0 
-          for _ in s.while_range(lambda: s.K < na + nb + nc + nd + 1):
-            s.primitive += Bx[s.I] * By[s.J] * Bz[s.K] * boys_eval[s.I + s.J + s.K]
-            s.K += 1
-          s.J += 1
-        s.I += 1
-      value = 2*jax.lax.pow(jnp.pi,2.5)/(gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
-              *jnp.exp(-aa*bb*rab2/gamma1) \
-              *jnp.exp(-cc*dd*rcd2/gamma2)*s.primitive*coef
-      return value
+
+    def loop_i1(arr0):
+       i1, i2, r1, r2, u, B = arr0
+       Bterm_i1 = binomial_prefactor(i1,l1,l2,pa_pow,pb_pow)
+       tmp_i1 = i1
+       r1 = i1 // 2
+       # Each nested body derives fresh Bterm_*/tmp_* names from the enclosing level; "*=" on a closed-over name would make it local and raise UnboundLocalError.
+       def loop_r1(arr1):
+          i1, i2, r1, r2, u, B = arr1
+          Bterm_r1 = Bterm_i1 * fact_ratio2[i1,r1]
+          Bterm_r1 = Bterm_r1 * g1_pow[r1-i1]
+          tmp_r1 = tmp_i1 - 2 * r1
+          i2 = l3 + l4
+
+          def loop_i2(arr2):
+             i1, i2, r1, r2, u, B = arr2
+             Bterm_i2 = Bterm_r1 * neg_one_pow[i2]
+             Bterm_i2 = Bterm_i2 * binomial_prefactor(i2,l3,l4,qc_pow,qd_pow)
+             tmp_i2 = tmp_r1 + i2
+             r2 = i2 // 2
+
+             def loop_r2(arr3):
+                i1, i2, r1, r2, u, B = arr3
+                Bterm_r2 = Bterm_i2 * fact_ratio2[i2,r2]
+                Bterm_r2 = Bterm_r2 * g2_pow[r2-i2]
+                tmp_r2 = tmp_i2 - 2 * r2
+                u = tmp_r2 // 2
+
+                def loop_u(arr4):
+                   i1, i2, r1, r2, u, B = arr4
+                   I = tmp_r2 - u
+                   Bterm_u = Bterm_r2 * neg_one_pow[u]
+                   Bterm_u = Bterm_u * fact_ratio2[tmp_r2,u]
+                   Bterm_u = Bterm_u * qp_pow[tmp_r2 - 2 * u]
+                   Bterm_u = Bterm_u * oodelta_pow[I]
+                   B = B.at[I].add(Bterm_u)  # accumulate like the removed jax.ops.index_add; .set() would overwrite earlier terms
+                   u -= 1
+                   return (i1, i2, r1, r2, u, B)
+
+                i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr4: arr4[4] > -1, loop_u, (i1, i2, r1, r2, u, B))
+                r2_ -= 1
+                return (i1_, i2_, r1_, r2_, u_, B_)
+
+             i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr3: arr3[3] > -1, loop_r2, (i1, i2, r1, r2, u, B))
+             i2_ -= 1
+             return (i1_, i2_, r1_, r2_, u_, B_)
+
+          i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr2: arr2[1] > -1, loop_i2, (i1, i2, r1, r2, u, B))
+          r1_ -= 1
+          return (i1_, i2_, r1_, r2_, u_, B_)
+
+       i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr1: arr1[2] > -1, loop_r1, (i1, i2, r1, r2, u, B))
+       i1_ -= 1
+       return (i1_, i2_, r1_, r2_, u_, B_)
+
+    i1, i2, r1, r2, u, B = while_loop(lambda arr0: arr0[0] > -1, loop_i1, (l1 + l2, 0, 0, 0, 0, B_vals)) # (i1, i2, r1, r2, u, B)
+    return B
+
+# def primitive_tei(La,Lb,Lc,Ld, A, B, C, D, aa, bb, cc, dd, c1, c2, c3, c4):
+#     """
+#     TODO can define a jvp rule for this, have it increment arguments appropriately
+#     Computes a single contracted two electron integral.
+#     given angular momentum vectors, centers, and single value exponents and contraction coefficients
+#     """
+#     # NOTE THIS FUNCTION IS NOT USED.
+#     # For debugging. This is implementation is directly coded into tei_array
+#     # in order to save some intermediates.
+#     la, ma, na = La
+#     lb, mb, nb = Lb
+#     lc, mc, nc = Lc
+#     ld, md, nd = Ld
+#     xa,ya,za = A
+#     xb,yb,zb = B
+#     xc,yc,zc = C
+#     xd,yd,zd = D
+
+#     rab2 = jnp.dot(A-B,A-B)
+#     rcd2 = jnp.dot(C-D,C-D)
+#     coef = c1 * c2 * c3 * c4
+#     xyzp = gaussian_product(aa,A,bb,B)
+#     xyzq = gaussian_product(cc,C,dd,D)
+#     xp,yp,zp = xyzp
+#     xq,yq,zq = xyzq
+#     rpq2 = jnp.dot(xyzp-xyzq,xyzp-xyzq)
+#     gamma1 = aa + bb
+#     gamma2 = cc + dd
+#     delta = 0.25*(1/gamma1+1/gamma2)
+#     Bx = B_array(la,lb,lc,ld,xp,xa,xb,xq,xc,xd,gamma1,gamma2,delta)
+#     By = B_array(ma,mb,mc,md,yp,ya,yb,yq,yc,yd,gamma1,gamma2,delta)
+#     Bz = B_array(na,nb,nc,nd,zp,za,zb,zq,zc,zd,gamma1,gamma2,delta)
+#     boys_arg = 0.25*rpq2/delta
+#     boys_eval = boys(jnp.arange(13), boys_arg) # supports up to f functions
+
+#     with loops.Scope() as s:
+#       s.I = 0
+#       s.J = 0
+#       s.K = 0
+#       s.primitive = 0.
+#       s.I = 0
+#       for _ in s.while_range(lambda: s.I < la + lb + lc + ld + 1):
+#         s.J = 0
+#         for _ in s.while_range(lambda: s.J < ma + mb + mc + md + 1):
+#           s.K = 0
+#           for _ in s.while_range(lambda: s.K < na + nb + nc + nd + 1):
+#             s.primitive += Bx[s.I] * By[s.J] * Bz[s.K] * boys_eval[s.I + s.J + s.K]
+#             s.K += 1
+#           s.J += 1
+#         s.I += 1
+#       value = 2*jax.lax.pow(jnp.pi,2.5)/(gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
+#               *jnp.exp(-aa*bb*rab2/gamma1) \
+#               *jnp.exp(-cc*dd*rcd2/gamma2)*s.primitive*coef
+#       return value
 
 def tei_array(geom, basis):
     """
@@ -149,122 +167,144 @@ def tei_array(geom, basis):
     # Shape: (nprim, nprim, natom, 3, max_am+1). In loop index PA_pow as [p1,p2,atoms[p1],:,:]
     PminusA_pow = jnp.power(jnp.transpose(jnp.broadcast_to(PminusA, (max_am+1,nprim,nprim,natom,3)), (1,2,3,4,0)), jnp.arange(max_am+1))
 
-    with loops.Scope() as s:
-      s.G = jnp.zeros((nbf,nbf,nbf,nbf))
-      s.a = 0  # center A angular momentum iterator 
-      s.b = 0  # center B angular momentum iterator 
-      s.c = 0  # center C angular momentum iterator 
-      s.d = 0  # center D angular momentum iterator 
-
-      # Loop over primitive quartets, compute integral, add to appropriate index in G
-      for prim_quar in s.range(primitive_quartets.shape[0]):
-        # Load in primitive indices, coeffs, exponents, centers, angular momentum index, and leading placement index in TEI array
-        p1,p2,p3,p4 = primitive_quartets[prim_quar] 
-        coef = coeffs[p1] * coeffs[p2] * coeffs[p3] * coeffs[p4]
-        aa, bb, cc, dd = exps[p1], exps[p2], exps[p3], exps[p4]
-        ld1, ld2, ld3, ld4 = am_leading_indices[ams[p1]],am_leading_indices[ams[p2]],am_leading_indices[ams[p3]],am_leading_indices[ams[p4]]
-        idx1, idx2, idx3, idx4 = indices[p1],indices[p2],indices[p3],indices[p4],
-        #A, B, C, D = geom[atoms[p1]], geom[atoms[p2]], geom[atoms[p3]], geom[atoms[p4]]
-
-        # Compute common intermediates before looping over AM distributions.
-        # Avoids redundant recomputations/reassignment for all classes other than (ss|ss).
-        #AB = A - B
-        #CD = C - D
-        #rab2 = jnp.dot(AB,AB)
-        #rcd2 = jnp.dot(CD,CD)
-        #P = (aa * A + bb * B) / gamma1
-        #Q = (cc * C + dd * D) / gamma2
-        gamma1 = aa + bb
-        gamma2 = cc + dd
-
-        #TODO
-        P = gaussian_products[p1,p2]
-        Q = gaussian_products[p3,p4]
-        rab2 = AmBdot[atoms[p1],atoms[p2]]
-        rcd2 = AmBdot[atoms[p3],atoms[p4]]
-        #PA = PminusA[p1,p2,atoms[p1]]
-        #PB = PminusA[p1,p2,atoms[p2]]
-        #QC = PminusA[p3,p4,atoms[p3]]
-        #QD = PminusA[p3,p4,atoms[p4]]
-        #TODO
-
-        PQ = P - Q
-        rpq2 = jnp.dot(PQ,PQ)
-        delta = 0.25*(1/gamma1+1/gamma2)
-
-        boys_arg = 0.25 * rpq2 / delta
-        boys_eval = boys(jnp.arange(max_am_idx), boys_arg) 
-
-        # Need all powers of Pi-Ai,Pi-Bi,Qi-Ci,Qi-Di (i=x,y,z) up to max_am and Qi-Pi up to max_am_idx
-        # note: this computes unncessary quantities for lower angular momentum, 
-        # but avoids repeated computation of the same quantities in loops for higher angular momentum
-
-        #PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+1,3)).T, jnp.arange(max_am+1))
-        #PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+1,3)).T, jnp.arange(max_am+1))
-        #QC_pow = jnp.power(jnp.broadcast_to(Q-C, (max_am+1,3)).T, jnp.arange(max_am+1))
-        #QD_pow = jnp.power(jnp.broadcast_to(Q-D, (max_am+1,3)).T, jnp.arange(max_am+1))
-
-        PA_pow = PminusA_pow[p1,p2,atoms[p1],:,:]
-        PB_pow = PminusA_pow[p1,p2,atoms[p2],:,:]
-        QC_pow = PminusA_pow[p3,p4,atoms[p3],:,:]
-        QD_pow = PminusA_pow[p3,p4,atoms[p4],:,:]
-
-        QP_pow = jnp.power(jnp.broadcast_to(Q-P, (max_am_idx,3)).T, jnp.arange(max_am_idx))
-        # Gamma powers are negative, up to -(l1+l2). 
-        # Make array such that the given negative index returns the same negative power.
-        g1_pow = jnp.power(4*gamma1, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1)) 
-        g2_pow = jnp.power(4*gamma2, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1)) 
-        oodelta_pow = jnp.power(1 / delta, jnp.arange(max_am_idx))  # l1 + l2 + l3 + l4 + 1
-
-        prefactor = 34.986836655249726 / (gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
-                    * jnp.exp(-aa*bb*rab2/gamma1 + -cc*dd*rcd2/gamma2) * coef
-
-        # TODO is there symmetry here?
-        s.a = 0
-        for _ in s.while_range(lambda: s.a < dims[p1]):
-          s.b = 0
-          for _ in s.while_range(lambda: s.b < dims[p2]):
-            s.c = 0
-            for _ in s.while_range(lambda: s.c < dims[p3]):
-              s.d = 0
-              for _ in s.while_range(lambda: s.d < dims[p4]):
-                # Collect angular momentum and index in G
-                la, ma, na = angular_momentum_combinations[s.a + ld1]
-                lb, mb, nb = angular_momentum_combinations[s.b + ld2]
-                lc, mc, nc = angular_momentum_combinations[s.c + ld3]
-                ld, md, nd = angular_momentum_combinations[s.d + ld4]
-                i = idx1 + s.a
-                j = idx2 + s.b
-                k = idx3 + s.c
-                l = idx4 + s.d
-                # Compute the primitive quartet tei and add to appropriate index in G
-                Bx = B_array(la,lb,lc,ld,PA_pow[0],PB_pow[0],QC_pow[0],QD_pow[0],QP_pow[0],g1_pow,g2_pow,oodelta_pow,B_vals)
-                By = B_array(ma,mb,mc,md,PA_pow[1],PB_pow[1],QC_pow[1],QD_pow[1],QP_pow[1],g1_pow,g2_pow,oodelta_pow,B_vals)
-                Bz = B_array(na,nb,nc,nd,PA_pow[2],PB_pow[2],QC_pow[2],QD_pow[2],QP_pow[2],g1_pow,g2_pow,oodelta_pow,B_vals)
-
-                with loops.Scope() as S:
-                  S.primitive = 0.
-                  S.I = 0
-                  S.J = 0
-                  S.K = 0
-                  for _ in S.while_range(lambda: S.I < la + lb + lc + ld + 1):
-                    S.J = 0 
-                    tmp = Bx[S.I] 
-                    for _ in S.while_range(lambda: S.J < ma + mb + mc + md + 1):
-                      S.K = 0 
-                      tmp *= By[S.J] 
-                      for _ in S.while_range(lambda: S.K < na + nb + nc + nd + 1):
-                        tmp *= Bz[S.K] * boys_eval[S.I + S.J + S.K]
-                        S.primitive += tmp
-                        S.K += 1
-                      S.J += 1
-                    S.I += 1
-                tei = prefactor * S.primitive
-                s.G = jax.ops.index_add(s.G, jax.ops.index[i,j,k,l], tei) 
-
-                s.d += 1
-              s.c += 1
-            s.b += 1
-          s.a += 1
-      return s.G
+    def loop_prim_quartets(n, G):
+      # Load in primitive indices, coeffs, exponents, centers, angular momentum index, and leading placement index in TEI array
+      p1,p2,p3,p4 = primitive_quartets[n]
+      coef = coeffs[p1] * coeffs[p2] * coeffs[p3] * coeffs[p4]
+      aa, bb, cc, dd = exps[p1], exps[p2], exps[p3], exps[p4]
+      ld1, ld2, ld3, ld4 = am_leading_indices[ams[p1]],am_leading_indices[ams[p2]],am_leading_indices[ams[p3]],am_leading_indices[ams[p4]]
+      idx1, idx2, idx3, idx4 = indices[p1],indices[p2],indices[p3],indices[p4],
+      #A, B, C, D = geom[atoms[p1]], geom[atoms[p2]], geom[atoms[p3]], geom[atoms[p4]]
+
+      # Compute common intermediates before looping over AM distributions.
+      # Avoids redundant recomputations/reassignment for all classes other than (ss|ss).
+      #AB = A - B
+      #CD = C - D
+      #rab2 = jnp.dot(AB,AB)
+      #rcd2 = jnp.dot(CD,CD)
+      #P = (aa * A + bb * B) / gamma1
+      #Q = (cc * C + dd * D) / gamma2
+      gamma1 = aa + bb
+      gamma2 = cc + dd
+
+      #TODO
+      P = gaussian_products[p1,p2]
+      Q = gaussian_products[p3,p4]
+      rab2 = AmBdot[atoms[p1],atoms[p2]]
+      rcd2 = AmBdot[atoms[p3],atoms[p4]]
+      #PA = PminusA[p1,p2,atoms[p1]]
+      #PB = PminusA[p1,p2,atoms[p2]]
+      #QC = PminusA[p3,p4,atoms[p3]]
+      #QD = PminusA[p3,p4,atoms[p4]]
+      #TODO
+
+      PQ = P - Q
+      rpq2 = jnp.dot(PQ,PQ)
+      delta = 0.25*(1/gamma1+1/gamma2)
+      boys_arg = 0.25 * rpq2 / delta
+      boys_eval = boys(jnp.arange(max_am_idx), boys_arg)
+
+      # Need all powers of Pi-Ai,Pi-Bi,Qi-Ci,Qi-Di (i=x,y,z) up to max_am and Qi-Pi up to max_am_idx
+      # note: this computes unncessary quantities for lower angular momentum,
+      # but avoids repeated computation of the same quantities in loops for higher angular momentum
+
+      #PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+1,3)).T, jnp.arange(max_am+1))
+      #PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+1,3)).T, jnp.arange(max_am+1))
+      #QC_pow = jnp.power(jnp.broadcast_to(Q-C, (max_am+1,3)).T, jnp.arange(max_am+1))
+      #QD_pow = jnp.power(jnp.broadcast_to(Q-D, (max_am+1,3)).T, jnp.arange(max_am+1))
+
+      PA_pow = PminusA_pow[p1,p2,atoms[p1],:,:]
+      PB_pow = PminusA_pow[p1,p2,atoms[p2],:,:]
+      QC_pow = PminusA_pow[p3,p4,atoms[p3],:,:]
+      QD_pow = PminusA_pow[p3,p4,atoms[p4],:,:]
+      QP_pow = jnp.power(jnp.broadcast_to(Q-P, (max_am_idx,3)).T, jnp.arange(max_am_idx))
+
+      # Gamma powers are negative, up to -(l1+l2).
+      # Make array such that the given negative index returns the same negative power.
+      g1_pow = jnp.power(4*gamma1, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1))
+      g2_pow = jnp.power(4*gamma2, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1))
+      oodelta_pow = jnp.power(1 / delta, jnp.arange(max_am_idx))  # l1 + l2 + l3 + l4 + 1
+
+      prefactor = 34.986836655249726 / (gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
+                  * jnp.exp(-aa*bb*rab2/gamma1 + -cc*dd*rcd2/gamma2) * coef
+
+      a, b, c, d = 0, 0, 0, 0
+      def loop_a(arr0):
+         a, b, c, d, G = arr0
+         b = 0
+
+         def loop_b(arr1):
+            a, b, c, d, G = arr1
+            c = 0
+
+            def loop_c(arr2):
+               a, b, c, d, G = arr2
+               d = 0
+
+               def loop_d(arr3):
+                  a, b, c, d, G = arr3
+                  # Collect angular momentum and index in G
+                  la, ma, na = angular_momentum_combinations[a + ld1]
+                  lb, mb, nb = angular_momentum_combinations[b + ld2]
+                  lc, mc, nc = angular_momentum_combinations[c + ld3]
+                  ld, md, nd = angular_momentum_combinations[d + ld4]
+                  i = idx1 + a
+                  j = idx2 + b
+                  k = idx3 + c
+                  l = idx4 + d
+                  # Compute the primitive quartet tei and add to appropriate index in G
+                  Bx = B_array(la,lb,lc,ld,PA_pow[0],PB_pow[0],QC_pow[0],QD_pow[0],QP_pow[0],g1_pow,g2_pow,oodelta_pow,B_vals)
+                  By = B_array(ma,mb,mc,md,PA_pow[1],PB_pow[1],QC_pow[1],QD_pow[1],QP_pow[1],g1_pow,g2_pow,oodelta_pow,B_vals)
+                  Bz = B_array(na,nb,nc,nd,PA_pow[2],PB_pow[2],QC_pow[2],QD_pow[2],QP_pow[2],g1_pow,g2_pow,oodelta_pow,B_vals)
+
+                  I, J, K, primitive = 0, 0, 0, 0.0
+                  def loop_I(arrI):
+                     I, J, K, primitive = arrI
+                     J = 0
+                     tmp_I = Bx[I]
+                     # Inner bodies build tmp_J/tmp_K from the enclosing value; "tmp *=" on a closed-over name raises UnboundLocalError.
+                     def loop_J(arrJ):
+                        I, J, K, primitive = arrJ
+                        K = 0
+                        tmp_J = tmp_I * By[J]
+
+                        def loop_K(arrK):
+                           I, J, K, primitive = arrK
+                           tmp_K = tmp_J * Bz[K] * boys_eval[I + J + K]
+                           primitive += tmp_K
+                           K += 1
+                           return (I, J, K, primitive)
+
+                        I_, J_, K_, primitive_ = while_loop(lambda arrK: arrK[2] < na + nb + nc + nd + 1, loop_K, (I, J, K, primitive))
+                        J_ += 1
+                        return (I_, J_, K_, primitive_)
+
+                     I_, J_, K_, primitive_ = while_loop(lambda arrJ: arrJ[1] < ma + mb + mc + md + 1, loop_J, (I, J, K, primitive))
+                     I_ += 1 # I
+                     return (I_, J_, K_, primitive_)
+
+                  I_, J_, K_, primitive_ = while_loop(lambda arrI: arrI[0] < la + lb + lc + ld + 1, loop_I, (I, J, K, primitive))
+
+                  tei = prefactor * primitive_
+                  G = G.at[i, j, k, l].add(tei)  # accumulate over primitive quartets, as the removed jax.ops.index_add did
+                  d += 1
+                  return (a, b, c, d, G)
+               # Start each inner loop from the locally reset counter (not the incoming tuple) and bound it by dims[p4]/dims[p3]/dims[p2], as the removed loops did.
+               a_, b_, c_, d_, G_ = while_loop(lambda arr3: arr3[3] < dims[p4], loop_d, (a, b, c, d, G))
+               c_ += 1
+               return (a_, b_, c_, d_, G_)
+
+            a_, b_, c_, d_, G_ = while_loop(lambda arr2: arr2[2] < dims[p3], loop_c, (a, b, c, d, G))
+            b_ += 1
+            return (a_, b_, c_, d_, G_)
+
+         a_, b_, c_, d_, G_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a, b, c, d, G))
+         a_ += 1
+         return (a_, b_, c_, d_, G_)
+
+      a_, b_, c_, d_, G_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, c, d, G))
+      return G_
+
+    G = fori_loop(0, primitive_quartets.shape[0], loop_prim_quartets, jnp.zeros((nbf,nbf,nbf,nbf)))
+    return G
 
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index a018cf5..3acc6b8 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -1,7 +1,6 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.experimental import loops
 import psi4
 
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index d15ecb8..fc6ea6c 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -1,7 +1,7 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.experimental import loops
+from jax.lax import while_loop
 
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
 from .ccsd import rccsd 
@@ -35,41 +35,64 @@ def inner_func(i,j,k):
 
         delta_occ = 2 - delta_ij - delta_jk
         Dd_occ = fock_Od[i] + fock_Od[j] + fock_Od[k] 
-        with loops.Scope() as s:
-          s.pT_contribution = 0.0
-          s.a, s.b, s.c = 0,0,0
-          for _ in s.while_range(lambda: s.a < v): #TODO this could be converted to s.range, may improve autodiff performance
-            s.b = 0
-            for _ in s.while_range(lambda: s.b < s.a + 1):
-              delta_vir = 1 + delta_v[s.a,s.b] 
-              s.c = 0
-              for _ in s.while_range(lambda: s.c < s.b + 1):
-                delta_vir = delta_vir + delta_v[s.b,s.c]
-                Dd = Dd_occ - (fock_Vd[s.a] + fock_Vd[s.b] + fock_Vd[s.c])
-                X = W[s.a,s.b,s.c]*V[s.a,s.b,s.c] + W[s.a,s.c,s.b]*V[s.a,s.c,s.b] + W[s.b,s.a,s.c]*V[s.b,s.a,s.c]  \
-                  + W[s.b,s.c,s.a]*V[s.b,s.c,s.a] + W[s.c,s.a,s.b]*V[s.c,s.a,s.b] + W[s.c,s.b,s.a]*V[s.c,s.b,s.a]
-                Y = (V[s.a,s.b,s.c] + V[s.b,s.c,s.a] + V[s.c,s.a,s.b])
-                Z = (V[s.a,s.c,s.b] + V[s.b,s.a,s.c] + V[s.c,s.b,s.a])
-                E = (Y - 2*Z)*(W[s.a,s.b,s.c] + W[s.b,s.c,s.a] + W[s.c,s.a,s.b]) + (Z - 2*Y)*(W[s.a,s.c,s.b]+W[s.b,s.a,s.c]+W[s.c,s.b,s.a]) + 3*X
-                s.pT_contribution += E * delta_occ / (Dd * delta_vir)
-                s.c += 1
-              s.b += 1
-            s.a += 1
-          return s.pT_contribution
-
-    with loops.Scope() as S:
-      S.pT = 0.0
-      S.i, S.j, S.k = 0, 0, 0
-      for _ in S.while_range(lambda: S.i < o): 
-        S.j = 0
-        for _ in S.while_range(lambda: S.j < S.i + 1): 
-          S.k = 0
-          for _ in S.while_range(lambda: S.k < S.j + 1): 
-            S.pT += inner_func(S.i,S.j,S.k)
-            S.k += 1
-          S.j += 1
-        S.i += 1
-      return S.pT
+
+        def loop_a(arr0):
+           a, b, c, pT_contribution = arr0
+           b = 0
+
+           def loop_b(arr1):
+              a, b, c, pT_contribution = arr1
+              c = 0
+              delta_vir = 1 + delta_v[a,b]
+              # loop_c must not rebind delta_vir: assigning it there makes it local and the read raises UnboundLocalError.
+              def loop_c(arr2):
+                 a, b, c, pT_contribution = arr2
+                 delta_vir_c = delta_vir + delta_v[b,c]
+                 Dd = Dd_occ - (fock_Vd[a] + fock_Vd[b] + fock_Vd[c])
+                 X = W[a,b,c]*V[a,b,c] + W[a,c,b]*V[a,c,b] + W[b,a,c]*V[b,a,c]  \
+                   + W[b,c,a]*V[b,c,a] + W[c,a,b]*V[c,a,b] + W[c,b,a]*V[c,b,a]
+                 Y = (V[a,b,c] + V[b,c,a] + V[c,a,b])
+                 Z = (V[a,c,b] + V[b,a,c] + V[c,b,a])
+                 E = (Y - 2*Z)*(W[a,b,c] + W[b,c,a] + W[c,a,b]) + (Z - 2*Y)*(W[a,c,b]+W[b,a,c]+W[c,b,a]) + 3*X
+                 pT_contribution += E * delta_occ / (Dd * delta_vir_c)
+                 c += 1
+                 return (a, b, c, pT_contribution)
+
+              a_, b_, c_, pT_contribution_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_c, (a, b, c, pT_contribution))
+              b_ += 1
+              return (a_, b_, c_, pT_contribution_)
+
+           a_, b_, c_, pT_contribution_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_b, (a, b, c, pT_contribution))
+           a_ += 1
+           return (a_, b_, c_, pT_contribution_)
+
+        a_, b_, c_, dE_pT = while_loop(lambda arr0: arr0[0] < v, loop_a, (0, 0, 0, 0.0)) # (a, b, c, pT_contribution); fixed "b_." typo that turned the target into an attribute
+        return dE_pT
+
+    def loop_i(arr0):
+       i, j, k, pT = arr0
+       j = 0
+
+       def loop_j(arr1):
+          i, j, k, pT = arr1
+          k = 0
+
+          def loop_k(arr2):
+             i, j, k, pT = arr2
+             pT += inner_func(i, j, k)
+             k += 1
+             return (i, j, k, pT)
+
+          i_, j_, k_, pT_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_k, (i, j, k, pT))
+          j_ += 1
+          return (i_, j_, k_, pT_)
+
+       i_, j_, k_, pT_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_j, (i, j, k, pT))
+       i_ += 1
+       return (i_, j_, k_, pT_)
+
+    i_, j_, k_, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
+    return pT
 
 def rccsd_t(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):
     E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 95b70b6..5881b91 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -1,6 +1,5 @@
 import jax
 from jax.config import config; config.update("jax_enable_x64", True)
-from jax.experimental import loops
 import jax.numpy as jnp
 from functools import partial
 
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 3324b4c..792b33b 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -1,7 +1,7 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.experimental import loops
+from jax.lax import fori_loop
 import psi4
 
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation, cartesian_product
@@ -30,10 +30,14 @@ def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options,
     # Loop algo (lower memory, but tei transform is the memory bottleneck)
     # Create all combinations of four loop variables to make XLA compilation easier
     indices = cartesian_product(jnp.arange(ndocc),jnp.arange(ndocc),jnp.arange(nvirt),jnp.arange(nvirt))
-    with loops.Scope() as s:
-      s.mp2_correlation = 0.
-      for idx in s.range(indices.shape[0]):
+
+    mp2_correlation = 0.0
+    def loop_mp2(idx, mp2_corr):
         i,j,a,b = indices[idx]
-        s.mp2_correlation += G[i, a, j, b] * (2 * G[i, a, j, b] - G[i, b, j, a]) * e_denom[i,a,j,b]
-      return E_scf + s.mp2_correlation
+        mp2_corr += G[i, a, j, b] * (2 * G[i, a, j, b] - G[i, b, j, a]) * e_denom[i,a,j,b]
+        return mp2_corr
+
+    dE_mp2 = fori_loop(0, indices.shape[0], loop_mp2, mp2_correlation)
+
+    return E_scf + dE_mp2
 
diff --git a/setup.py b/setup.py
index 122c16e..8b7562a 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,8 @@
         author_email='adabbott@uga.edu',
         url="none",
         license='BSD-3C',
-        packages=setuptools.find_packages(),
+        packages=setuptools.find_packages(include=["quax", "quax.*"]),  # keep "quax" as the top-level package; its modules use "from .." relative imports
+        package_dir={"quax": "quax"},
         install_requires=[
             'numpy>=1.7',
             'jax>=0.2.9',
@@ -31,5 +32,5 @@
             'Intended Audience :: Science/Research',
             'Programming Language :: Python :: 3',
         ],
-        zip_safe=True,
+        zip_safe=False
     )

From 30fa207994d68c5d9f468eaffe2ca19b191ab71a Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Thu, 24 Aug 2023 16:06:39 -0400
Subject: [PATCH 02/91] Differentiate loop variables

---
 quax/constants.py                        |   5 +
 quax/external_integrals/makefile         |  19 +--
 quax/external_integrals/tmp_potential.py |  94 ++++++-------
 quax/integrals/oei.py                    |  94 ++++++-------
 quax/integrals/tei.py                    | 170 +++++++++++------------
 quax/methods/ccsd_t.py                   |  82 +++++------
 6 files changed, 235 insertions(+), 229 deletions(-)

diff --git a/quax/constants.py b/quax/constants.py
index 17bcd8c..7ec3544 100644
--- a/quax/constants.py
+++ b/quax/constants.py
@@ -11,3 +11,8 @@
     if lib.match(path):
         from . import external_integrals 
         libint_imported = True
+
+if libint_imported:
+    print("Using Libint integrals...")
+else:
+    print("Using Quax integrals...")
diff --git a/quax/external_integrals/makefile b/quax/external_integrals/makefile
index 00cfd98..273df46 100644
--- a/quax/external_integrals/makefile
+++ b/quax/external_integrals/makefile
@@ -2,26 +2,26 @@
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/adabbott/Git/libint_am3/libint/build/libint-2.7.0-beta.6/PREFIX
+LIBINT_PREFIX := /home/vulcan/ecm23353/Code/bin/libint-2.8.0/PREFIX
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := /home/adabbott/anaconda3/envs/quax/include/eigen3
+I3 := /home/vulcan/ecm23353/.conda/envs/quax/include/eigen3
 # Python headers location 
-I4 := /home/adabbott/anaconda3/envs/quax/include/python3.7m
+I4 := /home/vulcan/ecm23353/.conda/envs/quax/include/python3.10
 # Pybind11 headers location 
-I5 := /home/adabbott/anaconda3/envs/quax/lib/python3.7/site-packages/pybind11/include
+I5 := /home/vulcan/ecm23353/.conda/envs/quax/lib/python3.10/site-packages/pybind11/include
 # HDF5 headers, static and shared libraries 
-I6 := /home/adabbott/Git/hdf5/hdf5-1.12.0/PREFIX/include
-L2 := /home/adabbott/Git/hdf5/hdf5-1.12.0/PREFIX/lib
+I6 := /home/vulcan/ecm23353/.conda/envs/quax/include
+L2 := /home/vulcan/ecm23353/.conda/envs/quax/lib
 # Edit path in quotes to be same location as L2 definition above
-RPATH := -Wl,-rpath,"/home/adabbott/Git/hdf5/hdf5-1.12.0/PREFIX/lib"
+RPATH := -Wl,-rpath,"/home/vulcan/ecm23353/.conda/envs/quax/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
-TARGETS := libint_interface.cpython-37m-x86_64-linux-gnu.so
+TARGETS := libint_interface.cpython-310-x86_64-linux-gnu.so
 OBJ     := libint_interface.o
 
 # Rest is boilerplate. Do not edit unless you know what you're doing.
@@ -29,7 +29,8 @@ OBJ     := libint_interface.o
 
 all: $(TARGETS)
 
-clean: rm -f $(OBJ)
+clean:
+	rm -f $(OBJ)
 
 $(OBJ): %.o : %.cc $(DEPS)
 	$(CC) -c $< -o $@ -O3 -fPIC -std=c++11 -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
diff --git a/quax/external_integrals/tmp_potential.py b/quax/external_integrals/tmp_potential.py
index 29fff39..371dd5e 100644
--- a/quax/external_integrals/tmp_potential.py
+++ b/quax/external_integrals/tmp_potential.py
@@ -10,33 +10,33 @@
 def A_array(l1,l2,PA,PB,CP,g,A_vals):
 
     def loop_i(arr0):
-       i, r, u, A = arr0
-       Aterm = neg_one_pow[i] * binomial_prefactor(i,l1,l2,PA,PB) * factorials[i]
-       r = i // 2
+       i_0, r_0, u_0, A_0 = arr0
+       Aterm_0 = neg_one_pow[i_0] * binomial_prefactor(i_0,l1,l2,PA,PB) * factorials[i_0]
+       r_0 = i_0 // 2
 
        def loop_r(arr1):
-          i, r, u, Aterm, A = arr1
-          u = (i - 2 * r) // 2
+          i_1, r_1, u_1, Aterm_1, A_1 = arr1
+          u_1 = (i_1 - 2 * r_1) // 2
 
           def loop_u(arr2):
-             i, r, u, Aterm, A = arr2
-             I = i - 2 * r - u
-             tmp = I - u
-             fact_ratio = 1 / (factorials[r] * factorials[u] * factorials[tmp])
-             Aterm *= neg_one_pow[u]  * CP[tmp] * (0.25 / g)**(r+u) * fact_ratio
-             A = A.at[I].set(u)
-             u -= 1
-             return (i, r, u, Aterm, A)
+             i_2, r_2, u_2, Aterm_2, A_2 = arr2
+             I = i_2 - 2 * r_2 - u_2
+             tmp = I - u_2
+             fact_ratio = 1 / (factorials[r_2] * factorials[u_2] * factorials[tmp])
+             Aterm_2 *= neg_one_pow[u_2]  * CP[tmp] * (0.25 / g)**(r_2+u_2) * fact_ratio
+             A_2 = A_2.at[I].set(Aterm_2)
+             u_2 -= 1
+             return (i_2, r_2, u_2, Aterm_2, A_2)
 
-          i_, r_, u_, Aterm_, A_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i, r, u, Aterm, A))
-          r_ -= 1
-          return (i_, r_, u_, Aterm_, A_)
+          i_1_, r_1_, u_1_, Aterm_1_, A_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i_1, r_1, u_1, Aterm_1, A_1))
+          r_1_ -= 1
+          return (i_1_, r_1_, u_1_, Aterm_1_, A_1_)
 
-       i_, r_, u_, Aterm_, A_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i, r, u, Aterm, A))
-       i_ -= 1
-       return (i_, r_, u_, A_)
+       i_0_, r_0_, u_0_, Aterm_0_, A_0_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i_0, r_0, u_0, Aterm_0, A_0))
+       i_0_ -= 1
+       return (i_0_, r_0_, u_0_, A_0_)
 
-    i_, r_, u_, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
+    i, r, u, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
 
     return A
 
@@ -55,26 +55,26 @@ def loop_val(n, val):
 
       I, J, K, total = 0, 0, 0, 0
       def loop_I(arr0):
-         I, J, K, val, total = arr0
-         J = 0
+         I_0, J_0, K_0, val_0, total_0 = arr0
+         J_0 = 0
 
          def loop_J(arr1):
-            I, J, K, val, total = arr1
-            K = 0
+            I_1, J_1, K_1, val_1, total_1 = arr1
+            K_1 = 0
 
             def loop_K(arr2):
-               I, J, K, val, total = arr2
-               total += Ax[I] * Ay[J] * Az[K] * boys_eval[I + J + K, n]
-               K += 1
-               return (I, J, K, val, total)
+               I_2, J_2, K_2, val_2, total_2 = arr2
+               total_2 += Ax[I_2] * Ay[J_2] * Az[K_2] * boys_eval[I_2 + J_2 + K_2, n]
+               K_2 += 1
+               return (I_2, J_2, K_2, val_2, total_2)
 
-            I_, J_, K_, val_, total_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I, J, K, val, total))
-            J_ += 1
-            return (I_, J_, K_, val_, total_)
+            I_1_, J_1_, K_1_, val_1_, total_1_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I_1, J_1, K_1, val_1, total_1))
+            J_1_ += 1
+            return (I_1_, J_1_, K_1_, val_1_, total_1_)
 
-         I_, J_, K_, val_, total_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_I, (I, J, K, val, total))
-         I_ += 1
-         return (I_, J_, K_, val_, total_)
+         I_0_, J_0_, K_0_, val_0_, total_0_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I_0, J_0, K_0, val_0, total_0))
+         I_0_ += 1
+         return (I_0_, J_0_, K_0_, val_0_, total_0_)
 
       I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
       val_ += charges[n] * prefactor * total_
@@ -132,26 +132,26 @@ def tmp_potential(geom, basis, charges):
 
        a, b = 0, 0
        def loop_a(arr0):
-          a, b, oei = arr0
-          b = 0
+          a_0, b_0, oei_0 = arr0
+          b_0 = 0
 
           def loop_b(arr1):
-             a, b, oei = arr1
+             a_1, b_1, oei_1 = arr1
              # Gather angular momentum and index
-             la,ma,na = angular_momentum_combinations[a + ld1]
-             lb,mb,nb = angular_momentum_combinations[b + ld2]
+             la,ma,na = angular_momentum_combinations[a_1 + ld1]
+             lb,mb,nb = angular_momentum_combinations[b_1 + ld2]
              # To only create unique indices, need to have separate indices arrays for i and j.
-             i = indices[p1] + a
-             j = indices[p2] + b
+             i = indices[p1] + a_1
+             j = indices[p2] + b_1
              # Compute one electron integrals and add to appropriate index
              potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-             oei = oei.at[i,j].set(potential_int)
-             b += 1
-             return (a, b, oei)
+             oei_1 = oei_1.at[i,j].set(potential_int)
+             b_1 += 1
+             return (a_1, b_1, oei_1)
 
-          a_, b_, oei_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a, b, oei))
-          a_ += 1
-          return (a_, b_, oei_)
+          a_0_, b_0_, oei_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, oei_0))
+          a_0_ += 1
+          return (a_0_, b_0_, oei_0_)
 
        a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, V))
 
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 4327b9f..b5bde7d 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -72,33 +72,33 @@ def kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor):
 def A_array(l1,l2,PA,PB,CP,g,A_vals):
 
     def loop_i(arr0):
-       i, r, u, A = arr0
-       Aterm = neg_one_pow[i] * binomial_prefactor(i,l1,l2,PA,PB) * factorials[i]
-       r = i // 2
+       i_0, r_0, u_0, A_0 = arr0
+       Aterm_0 = neg_one_pow[i_0] * binomial_prefactor(i_0,l1,l2,PA,PB) * factorials[i_0]
+       r_0 = i_0 // 2
 
        def loop_r(arr1):
-          i, r, u, Aterm, A = arr1
-          u = (i - 2 * r) // 2
+          i_1, r_1, u_1, Aterm_1, A_1 = arr1
+          u_1 = (i_1 - 2 * r_1) // 2
 
           def loop_u(arr2):
-             i, r, u, Aterm, A = arr2
-             I = i - 2 * r - u
-             tmp = I - u
-             fact_ratio = 1 / (factorials[r] * factorials[u] * factorials[tmp])
-             Aterm *= neg_one_pow[u]  * CP[tmp] * (0.25 / g)**(r+u) * fact_ratio
-             A = A.at[I].set(u)
-             u -= 1
-             return (i, r, u, Aterm, A)
+             i_2, r_2, u_2, Aterm_2, A_2 = arr2
+             I = i_2 - 2 * r_2 - u_2
+             tmp = I - u_2
+             fact_ratio = 1 / (factorials[r_2] * factorials[u_2] * factorials[tmp])
+             Aterm_2 *= neg_one_pow[u_2]  * CP[tmp] * (0.25 / g)**(r_2+u_2) * fact_ratio
+             A_2 = A_2.at[I].set(Aterm_2)
+             u_2 -= 1
+             return (i_2, r_2, u_2, Aterm_2, A_2)
 
-          i_, r_, u_, Aterm_, A_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i, r, u, Aterm, A))
-          r_ -= 1
-          return (i_, r_, u_, Aterm_, A_)
+          i_1_, r_1_, u_1_, Aterm_1_, A_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i_1, r_1, u_1, Aterm_1, A_1))
+          r_1_ -= 1
+          return (i_1_, r_1_, u_1_, Aterm_1_, A_1_)
 
-       i_, r_, u_, Aterm_, A_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i, r, u, Aterm, A))
-       i_ -= 1
-       return (i_, r_, u_, A_)
+       i_0_, r_0_, u_0_, Aterm_0_, A_0_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i_0, r_0, u_0, Aterm_0, A_0))
+       i_0_ -= 1
+       return (i_0_, r_0_, u_0_, A_0_)
 
-    i_, r_, u_, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
+    i, r, u, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
 
     return A
 
@@ -116,26 +116,26 @@ def loop_val(n, val):
 
       I, J, K, total = 0, 0, 0, 0
       def loop_I(arr0):
-         I, J, K, val, total = arr0
-         J = 0
+         I_0, J_0, K_0, val_0, total_0 = arr0
+         J_0 = 0
 
          def loop_J(arr1):
-            I, J, K, val, total = arr1
-            K = 0
+            I_1, J_1, K_1, val_1, total_1 = arr1
+            K_1 = 0
 
             def loop_K(arr2):
-               I, J, K, val, total = arr2
-               total += Ax[I] * Ay[J] * Az[K] * boys_eval[I + J + K, n]
-               K += 1
-               return (I, J, K, val, total)
+               I_2, J_2, K_2, val_2, total_2 = arr2
+               total_2 += Ax[I_2] * Ay[J_2] * Az[K_2] * boys_eval[I_2 + J_2 + K_2, n]
+               K_2 += 1
+               return (I_2, J_2, K_2, val_2, total_2)
 
-            I_, J_, K_, val_, total_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I, J, K, val, total))
-            J_ += 1
-            return (I_, J_, K_, val_, total_)
+            I_1_, J_1_, K_1_, val_1_, total_1_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I_1, J_1, K_1, val_1, total_1))
+            J_1_ += 1
+            return (I_1_, J_1_, K_1_, val_1_, total_1_)
 
-         I_, J_, K_, val_, total_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I, J, K, val, total))
-         I_ += 1
-         return (I_, J_, K_, val_, total_)
+         I_0_, J_0_, K_0_, val_0_, total_0_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I_0, J_0, K_0, val_0, total_0))
+         I_0_ += 1
+         return (I_0_, J_0_, K_0_, val_0_, total_0_)
 
       I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
       val_ += charges[n] * prefactor * total_
@@ -194,28 +194,28 @@ def oei_arrays(geom, basis, charges):
 
        a, b = 0, 0
        def loop_a(arr0):
-          a, b, oei = arr0
-          b = 0
+          a_0, b_0, oei_0 = arr0
+          b_0 = 0
 
           def loop_b(arr1):
-             a, b, oei = arr1
+             a_1, b_1, oei_1 = arr1
              # Gather angular momentum and index
-             la,ma,na = angular_momentum_combinations[a + ld1]
-             lb,mb,nb = angular_momentum_combinations[b + ld2]
+             la,ma,na = angular_momentum_combinations[a_1 + ld1]
+             lb,mb,nb = angular_momentum_combinations[b_1 + ld2]
              # To only create unique indices, need to have separate indices arrays for i and j.
-             i = indices[p1] + a
-             j = indices[p2] + b
+             i = indices[p1] + a_1
+             j = indices[p2] + b_1
              # Compute one electron integrals and add to appropriate index
              overlap_int = overlap(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
              kinetic_int = kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
              potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-             oei = oei.at[([0,1,2],[i,i,i],[j,j,j])].set((overlap_int, kinetic_int, potential_int))
-             b += 1
-             return (a, b, oei)
+             oei_1 = oei_1.at[([0,1,2],[i,i,i],[j,j,j])].set((overlap_int, kinetic_int, potential_int))
+             b_1 += 1
+             return (a_1, b_1, oei_1)
 
-          a_, b_, oei_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a, b, oei))
-          a_ += 1
-          return (a_, b_, oei_)
+          a_0_, b_0_, oei_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, oei_0))
+          a_0_ += 1
+          return (a_0_, b_0_, oei_0_)
 
        a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, STV))
 
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 4ddcb5d..258d4eb 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -14,58 +14,58 @@ def B_array(l1,l2,l3,l4,pa_pow,pb_pow,qc_pow,qd_pow,qp_pow,g1_pow,g2_pow,oodelta
     # It does not appear to help to pull out binomial prefactors and compute outside loop.
 
     def loop_i1(arr0):
-       i1, i2, r1, r2, u, B = arr0
-       Bterm = binomial_prefactor(arr0[0],l1,l2,pa_pow,pb_pow)
-       tmp = i1
-       r1 = i1 // 2
+       i1_0, i2_0, r1_0, r2_0, u_0, B_0 = arr0
+       Bterm = binomial_prefactor(i1_0,l1,l2,pa_pow,pb_pow)
+       tmp = i1_0
+       r1_0 = i1_0 // 2
 
        def loop_r1(arr1):
-          i1, i2, r1, r2, u, B = arr1
-          Bterm *= fact_ratio2[i1,r1]
-          Bterm *= g1_pow[r1-i1]
-          tmp -= 2 * r1
-          i2 = l3 + l4
+          i1_1, i2_1, r1_1, r2_1, u_1, B_1 = arr1
+          Bterm *= fact_ratio2[i1_1,r1_1]
+          Bterm *= g1_pow[r1_1-i1_1]
+          tmp -= 2 * r1_1
+          i2_1 = l3 + l4
 
           def loop_i2(arr2):
-             i1, i2, r1, r2, u, B = arr2
-             Bterm *= neg_one_pow[i2]
-             Bterm *= binomial_prefactor(i2,l3,l4,qc_pow,qd_pow)
-             tmp += i2
-             r2 = i2 // 2
+             i1_2, i2_2, r1_2, r2_2, u_2, B_2 = arr2
+             Bterm *= neg_one_pow[i2_2]
+             Bterm *= binomial_prefactor(i2_2,l3,l4,qc_pow,qd_pow)
+             tmp += i2_2
+             r2_2 = i2_2 // 2
 
              def loop_r2(arr3):
-                i1, i2, r1, r2, u, B = arr3
-                Bterm *= fact_ratio2[i2,r2]
-                Bterm *= g2_pow[r2-i2]
-                tmp -= 2 * r2
-                u = tmp // 2
+                i1_3, i2_3, r1_3, r2_3, u_3, B_3 = arr3
+                Bterm *= fact_ratio2[i2_3,r2_3]
+                Bterm *= g2_pow[r2_3-i2_3]
+                tmp -= 2 * r2_3
+                u_3 = tmp // 2
 
                 def loop_u(arr4):
-                   i1, i2, r1, r2, u, B = arr4
-                   I = tmp - u
-                   Bterm *= neg_one_pow[u]
-                   Bterm *= fact_ratio2[tmp,u]
-                   Bterm *= qp_pow[tmp - 2 * u]
+                   i1_4, i2_4, r1_4, r2_4, u_4, B_4 = arr4
+                   I = tmp - u_4
+                   Bterm *= neg_one_pow[u_4]
+                   Bterm *= fact_ratio2[tmp,u_4]
+                   Bterm *= qp_pow[tmp - 2 * u_4]
                    Bterm *= oodelta_pow[I]
                    B = B.at[I].set(Bterm)
-                   u -= 1
-                   return (i1, i2, r1, r2, u, B)
+                   u_4 -= 1
+                   return (i1_4, i2_4, r1_4, r2_4, u_4, B_4)
 
-                i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr4: arr4[4] > -1, loop_u, (i1, i2, r1, r2, u, B))
-                r2_ -= 1
-                return (i1_, i2_, r1_, r2_, u_, B_)
+                i1_3_, i2_3_, r1_3_, r2_3_, u_3_, B_3_ = while_loop(lambda arr4: arr4[4] > -1, loop_u, (i1_3, i2_3, r1_3, r2_3, u_3, B_3))
+                r2_3_ -= 1
+                return (i1_3_, i2_3_, r1_3_, r2_3_, u_3_, B_3_)
 
-             i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr3: arr3[3] > -1, loop_r2, (i1, i2, r1, r2, u, B))
-             i2_ -= 1
-             return (i1_, i2_, r1_, r2_, u_, B_)
+             i1_2_, i2_2_, r1_2_, r2_2_, u_2_, B_2_ = while_loop(lambda arr3: arr3[3] > -1, loop_r2, (i1_2, i2_2, r1_2, r2_2, u_2, B_2))
+             i2_2_ -= 1
+             return (i1_2_, i2_2_, r1_2_, r2_2_, u_2_, B_2_)
 
-          i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr2: arr2[1] > -1, loop_i2, (i1, i2, r1, r2, u, B))
-          r1_ -= 1
-          return (i1_, i2_, r1_, r2_, u_, B_)
+          i1_1_, i2_1_, r1_1_, r2_1_, u_1_, B_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_i2, (i1_1, i2_1, r1_1, r2_1, u_1, B_1))
+          r1_1_ -= 1
+          return (i1_1_, i2_1_, r1_1_, r2_1_, u_1_, B_1_)
 
-       i1_, i2_, r1_, r2_, u_, B_ = while_loop(lambda arr1: arr1[2] > -1, loop_r1, (i1, i2, r1, r2, u, B))
-       i1_ -= 1
-       return (i1_, i2_, r1_, r2_, u_, B_)
+       i1_0_, i2_0_, r1_0_, r2_0_, u_0_, B_0_ = while_loop(lambda arr1: arr1[2] > -1, loop_r1, (i1_0, i2_0, r1_0, r2_0, u_0, B_0))
+       i1_0_ -= 1
+       return (i1_0_, i2_0_, r1_0_, r2_0_, u_0_, B_0_)
 
     i1, i2, r1, r2, u, B = while_loop(lambda arr0: arr0[0] > -1, loop_i1, (l1 + l2, 0, 0, 0, 0, B_vals)) # (i1, i2, r1, r2, u, B)
     return B
@@ -167,7 +167,7 @@ def tei_array(geom, basis):
     # Shape: (nprim, nprim, natom, 3, max_am+1). In loop index PA_pow as [p1,p2,atoms[p1],:,:]
     PminusA_pow = jnp.power(jnp.transpose(jnp.broadcast_to(PminusA, (max_am+1,nprim,nprim,natom,3)), (1,2,3,4,0)), jnp.arange(max_am+1))
 
-    def loop_prim_quartets(n, G):
+    def loop_prim_quartets(n, G_tei):
       # Load in primitive indices, coeffs, exponents, centers, angular momentum index, and leading placement index in TEI array
       p1,p2,p3,p4 = primitive_quartets[n]
       coef = coeffs[p1] * coeffs[p2] * coeffs[p3] * coeffs[p4]
@@ -230,28 +230,28 @@ def loop_prim_quartets(n, G):
 
       a, b, c, d = 0, 0, 0, 0
       def loop_a(arr0):
-         a, b, c, d, G = arr0
-         b = 0
+         a_0, b_0, c_0, d_0, G_0 = arr0
+         b_0 = 0
 
          def loop_b(arr1):
-            a, b, c, d, G = arr1
-            c = 0
+            a_1, b_1, c_1, d_1, G_1 = arr1
+            c_1 = 0
 
             def loop_c(arr2):
-               a, b, c, d, G = arr2
-               d = 0
+               a_2, b_2, c_2, d_2, G_2 = arr2
+               d_2 = 0
 
                def loop_d(arr3):
-                  a, b, c, d, G = arr3
+                  a_3, b_3, c_3, d_3, G_3 = arr3
                   # Collect angular momentum and index in G
-                  la, ma, na = angular_momentum_combinations[a + ld1]
-                  lb, mb, nb = angular_momentum_combinations[b + ld2]
-                  lc, mc, nc = angular_momentum_combinations[c + ld3]
-                  ld, md, nd = angular_momentum_combinations[d + ld4]
-                  i = idx1 + a
-                  j = idx2 + b
-                  k = idx3 + c
-                  l = idx4 + d
+                  la, ma, na = angular_momentum_combinations[a_3 + ld1]
+                  lb, mb, nb = angular_momentum_combinations[b_3 + ld2]
+                  lc, mc, nc = angular_momentum_combinations[c_3 + ld3]
+                  ld, md, nd = angular_momentum_combinations[d_3 + ld4]
+                  i = idx1 + a_3
+                  j = idx2 + b_3
+                  k = idx3 + c_3
+                  l = idx4 + d_3
                   # Compute the primitive quartet tei and add to appropriate index in G
                   Bx = B_array(la,lb,lc,ld,PA_pow[0],PB_pow[0],QC_pow[0],QD_pow[0],QP_pow[0],g1_pow,g2_pow,oodelta_pow,B_vals)
                   By = B_array(ma,mb,mc,md,PA_pow[1],PB_pow[1],QC_pow[1],QD_pow[1],QP_pow[1],g1_pow,g2_pow,oodelta_pow,B_vals)
@@ -259,51 +259,51 @@ def loop_d(arr3):
 
                   I, J, K, primitive = 0, 0, 0, 0.0
                   def loop_I(arrI):
-                     I, J, K, primitive = arrI
-                     J = 0
-                     tmp = Bx[I]
+                     I_I, J_I, K_I, primitive_I = arrI
+                     J_I = 0
+                     tmp = Bx[I_I]
 
                      def loop_J(arrJ):
-                        I, J, K, primitive = arrJ
-                        K = 0
-                        tmp *= By[J]
+                        I_J, J_J, K_J, primitive_J = arrJ
+                        K_J = 0
+                        tmp *= By[J_J]
 
                         def loop_K(arrK):
-                           I, J, K, primitive = arrK
-                           tmp *= Bz[K] * boys_eval[I + J + K]
-                           primitive += tmp
-                           K += 1
-                           return (I, J, K, primitive)
+                           I_K, J_K, K_K, primitive_K = arrK
+                           tmp *= Bz[K_K] * boys_eval[I_K + J_K + K_K]
+                           primitive_K += tmp
+                           K_K += 1
+                           return (I_K, J_K, K_K, primitive_K)
 
-                        I_, J_, K_, primitive_ = while_loop(lambda arrK: arrK[2] < na + nb + nc + nd + 1, loop_K, (I, J, K, primitive))
-                        J_ += 1
-                        return (I_, J_, K_, primitive_)
+                        I_J_, J_J_, K_J_, primitive_J_ = while_loop(lambda arrK: arrK[2] < na + nb + nc + nd + 1, loop_K, (I_J, J_J, K_J, primitive_J))
+                        J_J_ += 1
+                        return (I_J_, J_J_, K_J_, primitive_J_)
 
-                     I_, J_, K_, primitive_ = while_loop(lambda arrJ: arrJ[1] < ma + mb + mc + md + 1, loop_J, (I, J, K, primitive))
-                     I_ += 1 # I
-                     return (I_, J_, K_, primitive_)
+                     I_I_, J_I_, K_I_, primitive_I_ = while_loop(lambda arrJ: arrJ[1] < ma + mb + mc + md + 1, loop_J, (I_I, J_I, K_I, primitive_I))
+                     I_I_ += 1 # I
+                     return (I_I_, J_I_, K_I_, primitive_I_)
 
                   I_, J_, K_, primitive_ = while_loop(lambda arrI: arrI[0] < la + lb + lc + ld + 1, loop_I, (I, J, K, primitive))
 
                   tei = prefactor * primitive_
-                  G = G.at[i, j, k, l].set(tei)
-                  d += 1
-                  return (a, b, c, d, G)
+                  G_3 = G_3.at[i, j, k, l].set(tei)
+                  d_3 += 1
+                  return (a_3, b_3, c_3, d_3, G_3)
 
-               a_, b_, c_, d_, G_ = while_loop(lambda arr3: arr3[3] < dims[arr3[6]], loop_d, arr2)
-               c_ += 1
-               return (a_, b_, c_, d_, G_)
+               a_2_, b_2_, c_2_, d_2_, G_2_ = while_loop(lambda arr3: arr3[3] < dims[p4], loop_d, (a_2, b_2, c_2, d_2, G_2))
+               c_2_ += 1
+               return (a_2_, b_2_, c_2_, d_2_, G_2_)
 
-            a_, b_, c_, d_, G_ = while_loop(lambda arr2: arr2[2] < dims[arr2[5]], loop_c, arr1)
-            b_ += 1
-            return (a_, b_, c_, d_, G_)
+            a_1_, b_1_, c_1_, d_1_, G_1_ = while_loop(lambda arr2: arr2[2] < dims[p3], loop_c, (a_1, b_1, c_1, d_1, G_1))
+            b_1_ += 1
+            return (a_1_, b_1_, c_1_, d_1_, G_1_)
 
-         a_, b_, c_, d_, G_ = while_loop(lambda arr1: arr1[1] < dims[arr1[4]], loop_b, arr0)
-         a_ += 1
-         return (a_, b_, c_, d_, G_)
+         a_0_, b_0_, c_0_, d_0_, G_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, c_0, d_0, G_0))
+         a_0_ += 1
+         return (a_0_, b_0_, c_0_, d_0_, G_0_)
 
-      a_, b_, c_, d_, G_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, c, d, G))
-      return G_
+      a_, b_, c_, d_, G_tei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, c, d, G_tei))
+      return G_tei_
 
     G = fori_loop(0, primitive_quartets.shape[0], loop_prim_quartets, jnp.zeros((nbf,nbf,nbf,nbf)))
     return G
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index fc6ea6c..2ab6a44 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -37,61 +37,61 @@ def inner_func(i,j,k):
         Dd_occ = fock_Od[i] + fock_Od[j] + fock_Od[k] 
 
         def loop_a(arr0):
-           a, b, c, pT_contribution = arr0
-           b = 0
+           a_0, b_0, c_0, pT_contribution_0 = arr0
+           b_0 = 0
 
            def loop_b(arr1):
-              a, b, c, pT_contribution = arr1
-              c = 0
-              delta_vir = 1 + delta_v[a,b]
+              a_1, b_1, c_1, pT_contribution_1 = arr1
+              c_1 = 0
+              delta_vir = 1 + delta_v[a_1,b_1]
 
               def loop_c(arr2):
-                 a, b, c, pT_contribution = arr2
-                 delta_vir = delta_vir + delta_v[b,c]
-                 Dd = Dd_occ - (fock_Vd[a] + fock_Vd[b] + fock_Vd[c])
-                 X = W[a,b,c]*V[a,b,c] + W[a,c,b]*V[a,c,b] + W[b,a,c]*V[b,a,c]  \
-                   + W[b,c,a]*V[b,c,a] + W[c,a,b]*V[c,a,b] + W[c,b,a]*V[c,b,a]
-                 Y = (V[a,b,c] + V[b,c,a] + V[c,a,b])
-                 Z = (V[a,c,b] + V[b,a,c] + V[c,b,a])
-                 E = (Y - 2*Z)*(W[a,b,c] + W[b,c,a] + W[c,a,b]) + (Z - 2*Y)*(W[a,c,b]+W[b,a,c]+W[c,b,a]) + 3*X
-                 pT_contribution += E * delta_occ / (Dd * delta_vir)
-                 c += 1
-                 return (a, b, c, pT_contribution)
-
-              a_, b_, c_, pT_contribution_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_c, (a, b, c, pT_contribution))
-              b_ += 1
-              return (a_, b_, c_, pT_contribution_)
-
-           a_, b_, c_, pT_contribution_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_b, (a, b, c, pT_contribution))
-           a_ += 1
-           return (a_, b_, c_, pT_contribution_)
-
-        a_, b_. c_, dE_pT = while_loop(lambda arr0: arr0[0] < v, loop_a, (0, 0, 0, 0.0)) # (a, b, c, pT_contribution)
+                 a_2, b_2, c_2, pT_contribution_2 = arr2
+                 delta_vir = delta_vir + delta_v[b_2,c_2]
+                 Dd = Dd_occ - (fock_Vd[a_2] + fock_Vd[b_2] + fock_Vd[c_2])
+                 X = W[a_2,b_2,c_2]*V[a_2,b_2,c_2] + W[a_2,c_2,b_2]*V[a_2,c_2,b_2] + W[b_2,a_2,c_2]*V[b_2,a_2,c_2]  \
+                   + W[b_2,c_2,a_2]*V[b_2,c_2,a_2] + W[c_2,a_2,b_2]*V[c_2,a_2,b_2] + W[c_2,b_2,a_2]*V[c_2,b_2,a_2]
+                 Y = (V[a_2,b_2,c_2] + V[b_2,c_2,a_2] + V[c_2,a_2,b_2])
+                 Z = (V[a_2,c_2,b_2] + V[b_2,a_2,c_2] + V[c_2,b_2,a_2])
+                 E = (Y - 2*Z)*(W[a_2,b_2,c_2] + W[b_2,c_2,a_2] + W[c_2,a_2,b_2]) + (Z - 2*Y)*(W[a_2,c_2,b_2]+W[b_2,a_2,c_2]+W[c_2,b_2,a_2]) + 3*X
+                 pT_contribution_2 += E * delta_occ / (Dd * delta_vir)
+                 c_2 += 1
+                 return (a_2, b_2, c_2, pT_contribution_2)
+
+              a_1_, b_1_, c_1_, pT_contribution_1_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_c, (a_1, b_1, c_1, pT_contribution_1))
+              b_1_ += 1
+              return (a_1_, b_1_, c_1_, pT_contribution_1_)
+
+           a_0_, b_0_, c_0_, pT_contribution_0_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_b, (a_0, b_0, c_0, pT_contribution_0))
+           a_0_ += 1
+           return (a_0_, b_0_, c_0_, pT_contribution_0_)
+
+        a, b, c, dE_pT = while_loop(lambda arr0: arr0[0] < v, loop_a, (0, 0, 0, 0.0)) # (a, b, c, pT_contribution)
         return dE_pT
 
     def loop_i(arr0):
-       i, j, k, pT = arr0
-       j = 0
+       i_0, j_0, k_0, pT_0 = arr0
+       j_0 = 0
 
        def loop_j(arr1):
-          i, j, k, pT = arr1
-          k = 0
+          i_1, j_1, k_1, pT_1 = arr1
+          k_1 = 0
 
           def loop_k(arr2):
-             i, j, k, pT = arr2
-             pT += inner_func(i, j, k)
-             k += 1
-             return (i, j, k, pT)
+             i_2, j_2, k_2, pT_2 = arr2
+             pT_2 += inner_func(i_2, j_2, k_2)
+             k_2 += 1
+             return (i_2, j_2, k_2, pT_2)
 
-          i_, j_, k_, pT_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_k, (i, j, k, pT))
-          j_ += 1
-          return (i_, j_, k_, pT_)
+          i_1_, j_1_, k_1_, pT_1_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_k, (i_1, j_1, k_1, pT_1))
+          j_1_ += 1
+          return (i_1_, j_1_, k_1_, pT_1_)
 
-       i_, j_, k_, pT_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_j, (i, j, k, pT))
-       i_ += 1
-       return (i_, j_, k_, pT_)
+       i_0_, j_0_, k_0_, pT_0_ = while_loop(lambda arr1: arr1[1] < arr1[0] + 1, loop_j, (i_0, j_0, k_0, pT_0))
+       i_0_ += 1
+       return (i_0_, j_0_, k_0_, pT_0_)
 
-    i_, j_, k_, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
+    i, j, k, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
     return pT
 
 def rccsd_t(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):

From 86a7b803ae64a30bb4bca1976e96a96d0ca5669d Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Mon, 28 Aug 2023 15:51:49 -0400
Subject: [PATCH 03/91] Working Quax with JAX Updated

---
 quax/constants.py                |  5 -----
 quax/external_integrals/makefile |  2 +-
 quax/external_integrals/oei.py   | 12 ++++++------
 quax/external_integrals/tei.py   |  4 ++--
 tests/test_energies.py           | 16 ++++++++--------
 tests/test_gradients.py          |  8 ++++----
 tests/test_hessians.py           |  8 ++++----
 7 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/quax/constants.py b/quax/constants.py
index 7ec3544..17bcd8c 100644
--- a/quax/constants.py
+++ b/quax/constants.py
@@ -11,8 +11,3 @@
     if lib.match(path):
         from . import external_integrals 
         libint_imported = True
-
-if libint_imported:
-    print("Using Libint integrals...")
-else:
-    print("Using Quax integrals...")
diff --git a/quax/external_integrals/makefile b/quax/external_integrals/makefile
index 273df46..58047d7 100644
--- a/quax/external_integrals/makefile
+++ b/quax/external_integrals/makefile
@@ -21,7 +21,7 @@ RPATH := -Wl,-rpath,"/home/vulcan/ecm23353/.conda/envs/quax/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
-TARGETS := libint_interface.cpython-310m-x86_64-linux-gnu.so
+TARGETS := libint_interface.cpython-310-x86_64-linux-gnu.so
 OBJ     := libint_interface.o
 
 # Rest is boilerplate. Do not edit unless you know what you're doing.
diff --git a/quax/external_integrals/oei.py b/quax/external_integrals/oei.py
index 4bcee2c..7f81093 100644
--- a/quax/external_integrals/oei.py
+++ b/quax/external_integrals/oei.py
@@ -43,12 +43,12 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         self.potential_deriv_p.def_impl(self.potential_deriv_impl)
 
         # Register the JVP rules with JAX
-        jax.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
-        jax.ad.primitive_jvps[self.overlap_deriv_p] = self.overlap_deriv_jvp
-        jax.ad.primitive_jvps[self.kinetic_p] = self.kinetic_jvp
-        jax.ad.primitive_jvps[self.kinetic_deriv_p] = self.kinetic_deriv_jvp
-        jax.ad.primitive_jvps[self.potential_p] = self.potential_jvp
-        jax.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
+        jax.interpreters.ad.primitive_jvps[self.overlap_deriv_p] = self.overlap_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.kinetic_p] = self.kinetic_jvp
+        jax.interpreters.ad.primitive_jvps[self.kinetic_deriv_p] = self.kinetic_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.potential_p] = self.potential_jvp
+        jax.interpreters.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
 
         # Register the batching rules with JAX
         jax.interpreters.batching.primitive_batchers[self.overlap_deriv_p] = self.overlap_deriv_batch
diff --git a/quax/external_integrals/tei.py b/quax/external_integrals/tei.py
index db1589a..002b421 100644
--- a/quax/external_integrals/tei.py
+++ b/quax/external_integrals/tei.py
@@ -42,8 +42,8 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         self.tei_deriv_p.def_impl(self.tei_deriv_impl)
 
         # Register the JVP rules with JAX
-        jax.ad.primitive_jvps[self.tei_p] = self.tei_jvp
-        jax.ad.primitive_jvps[self.tei_deriv_p] = self.tei_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.tei_p] = self.tei_jvp
+        jax.interpreters.ad.primitive_jvps[self.tei_deriv_p] = self.tei_deriv_jvp
 
         # Register tei_deriv batching rule with JAX
         jax.interpreters.batching.primitive_batchers[self.tei_deriv_p] = self.tei_deriv_batch
diff --git a/tests/test_energies.py b/tests/test_energies.py
index d4331b2..91b5266 100644
--- a/tests/test_energies.py
+++ b/tests/test_energies.py
@@ -14,31 +14,31 @@
 units bohr
 """)
 basis_name = 'sto-3g'
-psi4.set_options({'basis': basis_name, 
-                  'scf_type': 'pk', 
+psi4.set_options({'basis': basis_name,
+                  'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
                   'd_convergence':1e-10,
-                  'puream': 0, 
-                  'points':5, 
+                  'puream': 0,
+                  'points':5,
                   'fd_project':False})
 
 def test_hartree_fock(method='hf'):
     psi_e = psi4.energy(method + '/' + basis_name)
     quax_e = quax.core.energy(molecule, basis_name, method)
-    assert np.allclose(psi_e, quax_e) 
+    assert np.allclose(psi_e, quax_e)
 
 def test_mp2(method='mp2'):
     psi_e = psi4.energy(method + '/' + basis_name)
     quax_e = quax.core.energy(molecule, basis_name, method)
-    assert np.allclose(psi_e, quax_e) 
+    assert np.allclose(psi_e, quax_e)
 
 def test_ccsd(method='ccsd'):
     psi_e = psi4.energy(method + '/' + basis_name)
     quax_e = quax.core.energy(molecule, basis_name, method)
-    assert np.allclose(psi_e, quax_e) 
+    assert np.allclose(psi_e, quax_e)
 
 def test_ccsd_t(method='ccsd(t)'):
     psi_e = psi4.energy(method + '/' + basis_name)
     quax_e = quax.core.energy(molecule, basis_name, method)
-    assert np.allclose(psi_e, quax_e) 
+    assert np.allclose(psi_e, quax_e)
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
index f508957..d31ae01 100644
--- a/tests/test_gradients.py
+++ b/tests/test_gradients.py
@@ -14,13 +14,13 @@
 units bohr
 """)
 basis_name = 'sto-3g'
-psi4.set_options({'basis': basis_name, 
-                  'scf_type': 'pk', 
+psi4.set_options({'basis': basis_name,
+                  'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
                   'd_convergence':1e-10,
-                  'puream': 0, 
-                  'points':5, 
+                  'puream': 0,
+                  'points':5,
                   'fd_project':False})
 
 options = {'damping':True, 'spectral_shift':False, 'integral_algo': 'quax_core'}
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index 1e801b3..929181f 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -14,13 +14,13 @@
 units bohr
 """)
 basis_name = 'sto-3g'
-psi4.set_options({'basis': basis_name, 
-                  'scf_type': 'pk', 
+psi4.set_options({'basis': basis_name,
+                  'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
                   'd_convergence':1e-10,
-                  'puream': 0, 
-                  'points':5, 
+                  'puream': 0,
+                  'points':5,
                   'fd_project':False})
 
 options = {'damping':True, 'spectral_shift':False, 'integral_algo': 'quax_core'}

From 6863468306b350dfe8e1c351fa627d158948703b Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 15 Sep 2023 15:41:18 -0400
Subject: [PATCH 04/91] Removed quax_core ints, updated Libint

---
 quax/__init__.py                              |   3 -
 quax/constants.py                             |  10 +-
 quax/core.py                                  |   2 +-
 quax/external_integrals/__init__.py           |   9 -
 quax/external_integrals/oei.py                | 252 ----------
 quax/external_integrals/tei.py                | 131 -----
 quax/external_integrals/tmp_potential.py      | 160 ------
 quax/integrals/__init__.py                    |   8 +-
 .../buffer_lookups.h                          |   0
 quax/integrals/integrals_utils.py             | 163 ------
 .../libint_interface.cc                       | 263 +++++-----
 .../makefile                                  |  18 +-
 quax/integrals/oei.py                         | 468 +++++++++---------
 quax/integrals/tei.py                         | 435 +++++-----------
 quax/methods/ccsd.py                          |  50 +-
 quax/methods/ccsd_t.py                        |  26 +-
 quax/methods/hartree_fock.py                  |  14 +-
 quax/methods/ints.py                          | 182 +------
 quax/methods/mp2.py                           |   6 +-
 tests/test_gradients.py                       |   2 +-
 tests/test_hessians.py                        |   2 +-
 21 files changed, 600 insertions(+), 1604 deletions(-)
 delete mode 100644 quax/external_integrals/__init__.py
 delete mode 100644 quax/external_integrals/oei.py
 delete mode 100644 quax/external_integrals/tei.py
 delete mode 100644 quax/external_integrals/tmp_potential.py
 rename quax/{external_integrals => integrals}/buffer_lookups.h (100%)
 delete mode 100644 quax/integrals/integrals_utils.py
 rename quax/{external_integrals => integrals}/libint_interface.cc (88%)
 rename quax/{external_integrals => integrals}/makefile (60%)

diff --git a/quax/__init__.py b/quax/__init__.py
index ecde022..5a5d9fb 100644
--- a/quax/__init__.py
+++ b/quax/__init__.py
@@ -1,9 +1,6 @@
 from . import integrals 
 from . import constants
 
-if constants.libint_imported:
-    from . import external_integrals 
-
 from . import methods 
 from . import core
 from . import utils 
diff --git a/quax/constants.py b/quax/constants.py
index 17bcd8c..98f89a6 100644
--- a/quax/constants.py
+++ b/quax/constants.py
@@ -1,13 +1,17 @@
 import os
 import re
+import sys
 
 # Get absolute module path
 module_path = os.path.dirname(os.path.abspath(__file__))
 
-# Check if libint interface is being used
+# Check if libint interface is found
 libint_imported = False
 lib = re.compile("libint_interface\.cpython.+")
-for path in os.listdir(module_path + "/external_integrals"):
+for path in os.listdir(module_path + "/integrals"):
     if lib.match(path):
-        from . import external_integrals 
+        from . import integrals
         libint_imported = True
+
+if not libint_imported:
+    sys.exit("Libint is a required dependency!")
diff --git a/quax/core.py b/quax/core.py
index 6b38098..9f663f0 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -180,7 +180,7 @@ def partial_wrapper(*args):
         else:
             print("Error: Order {} partial derivatives are not exposed to the API.".format(deriv_order))
             partial_deriv = 0
-        return jnp.round(partial_deriv,10)
+        return jnp.round(partial_deriv, 10)
 
 def energy(molecule, basis_name, method, options=None):
     """
diff --git a/quax/external_integrals/__init__.py b/quax/external_integrals/__init__.py
deleted file mode 100644
index 7cbca32..0000000
--- a/quax/external_integrals/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from . import tei
-from . import oei
-from . import libint_interface
-
-from .tei import TEI 
-from .oei import OEI 
-
-from .tmp_potential import tmp_potential
-
diff --git a/quax/external_integrals/oei.py b/quax/external_integrals/oei.py
deleted file mode 100644
index 7f81093..0000000
--- a/quax/external_integrals/oei.py
+++ /dev/null
@@ -1,252 +0,0 @@
-import jax
-import jax.numpy as jnp
-import numpy as np
-import h5py
-import os
-import psi4
-from . import libint_interface
-from ..utils import get_deriv_vec_idx, how_many_derivs
-
-jax.config.update("jax_enable_x64", True)
-
-class OEI(object):
-
-    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
-        with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
-        natoms = molecule.natom()
-        nbf = basis_set.nbf()
-
-        # TODO implement core-algo for OEI's in libint_interface.cc
-        #if mode == 'core' and max_deriv_order > 0:
-            #self.oei_derivatives = {}
-
-        self.mode = mode
-        self.nbf = nbf
-        
-        # Create new JAX primitives for overlap, kinetic, potential evaluation and their derivatives 
-        self.overlap_p = jax.core.Primitive("overlap")
-        self.overlap_deriv_p = jax.core.Primitive("overlap_deriv")
-        self.kinetic_p = jax.core.Primitive("kinetic")
-        self.kinetic_deriv_p = jax.core.Primitive("kinetic_deriv")
-        self.potential_p = jax.core.Primitive("potential")
-        self.potential_deriv_p = jax.core.Primitive("potential_deriv")
-
-        # Register primitive evaluation rules
-        self.overlap_p.def_impl(self.overlap_impl)
-        self.overlap_deriv_p.def_impl(self.overlap_deriv_impl)
-        self.kinetic_p.def_impl(self.kinetic_impl)
-        self.kinetic_deriv_p.def_impl(self.kinetic_deriv_impl)
-        self.potential_p.def_impl(self.potential_impl)
-        self.potential_deriv_p.def_impl(self.potential_deriv_impl)
-
-        # Register the JVP rules with JAX
-        jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
-        jax.interpreters.ad.primitive_jvps[self.overlap_deriv_p] = self.overlap_deriv_jvp
-        jax.interpreters.ad.primitive_jvps[self.kinetic_p] = self.kinetic_jvp
-        jax.interpreters.ad.primitive_jvps[self.kinetic_deriv_p] = self.kinetic_deriv_jvp
-        jax.interpreters.ad.primitive_jvps[self.potential_p] = self.potential_jvp
-        jax.interpreters.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
-
-        # Register the batching rules with JAX
-        jax.interpreters.batching.primitive_batchers[self.overlap_deriv_p] = self.overlap_deriv_batch
-        jax.interpreters.batching.primitive_batchers[self.kinetic_deriv_p] = self.kinetic_deriv_batch
-        jax.interpreters.batching.primitive_batchers[self.potential_deriv_p] = self.potential_deriv_batch
-
-    # Create functions to call primitives
-    def overlap(self, geom):
-        return self.overlap_p.bind(geom)
-    
-    def overlap_deriv(self, geom, deriv_vec):
-        return self.overlap_deriv_p.bind(geom, deriv_vec) 
-    
-    def kinetic(self, geom):
-        return self.kinetic_p.bind(geom)
-    
-    def kinetic_deriv(self, geom, deriv_vec):
-        return self.kinetic_deriv_p.bind(geom, deriv_vec) 
-    
-    def potential(self, geom):
-        return self.potential_p.bind(geom)
-    
-    def potential_deriv(self, geom, deriv_vec):
-        return self.potential_deriv_p.bind(geom, deriv_vec) 
-    
-    # Create primitive evaluation rules 
-    def overlap_impl(self, geom):
-        S = libint_interface.overlap()
-        S = S.reshape(self.nbf,self.nbf)
-        return jnp.asarray(S)
-    
-    def kinetic_impl(self, geom):
-        T = libint_interface.kinetic() 
-        T = T.reshape(self.nbf,self.nbf)
-        return jnp.asarray(T) 
-    
-    def potential_impl(self, geom):
-        V = libint_interface.potential()
-        V = V.reshape(self.nbf,self.nbf)
-        return jnp.asarray(V)
-
-    def overlap_deriv_impl(self, geom, deriv_vec):
-        deriv_vec = np.asarray(deriv_vec, int)
-        deriv_order = np.sum(deriv_vec)
-
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
-        if self.mode == 'core':
-            S = libint_interface.overlap_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(S).reshape(self.nbf,self.nbf)
-        else:
-            idx = get_deriv_vec_idx(deriv_vec)
-            if os.path.exists("oei_derivs.h5"):
-                file_name = "oei_derivs.h5"
-                dataset_name = "overlap_deriv" + str(deriv_order)
-            elif os.path.exists("oei_partials.h5"):
-                file_name = "oei_partials.h5"
-                dataset_name = "overlap_deriv" + str(deriv_order) + "_" + str(idx)
-            else:
-                raise Exception("Something went wrong reading integral derivative file")
-            with h5py.File(file_name, 'r') as f:
-                data_set = f[dataset_name]
-                if len(data_set.shape) == 3:
-                    S = data_set[:,:,idx]
-                elif len(data_set.shape) == 2:
-                    S = data_set[:,:]
-                else:
-                    raise Exception("Something went wrong reading integral derivative file")
-            return jnp.asarray(S)
-
-    def kinetic_deriv_impl(self, geom, deriv_vec):
-        deriv_vec = np.asarray(deriv_vec, int)
-        deriv_order = np.sum(deriv_vec)
-
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
-        if self.mode == 'core':
-            T = libint_interface.kinetic_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(T).reshape(self.nbf,self.nbf)
-        else:
-            idx = get_deriv_vec_idx(deriv_vec)
-            if os.path.exists("oei_derivs.h5"):
-                file_name = "oei_derivs.h5"
-                dataset_name = "kinetic_deriv" + str(deriv_order)
-            elif os.path.exists("oei_partials.h5"):
-                file_name = "oei_partials.h5"
-                dataset_name = "kinetic_deriv" + str(deriv_order) + "_" + str(idx)
-            else:
-                raise Exception("Something went wrong reading integral derivative file")
-            with h5py.File(file_name, 'r') as f:
-                data_set = f[dataset_name]
-                if len(data_set.shape) == 3:
-                    T = data_set[:,:,idx]
-                elif len(data_set.shape) == 2:
-                    T = data_set[:,:]
-                else:
-                    raise Exception("Something went wrong reading integral derivative file")
-            return jnp.asarray(T)
-
-    def potential_deriv_impl(self, geom, deriv_vec):
-        deriv_vec = np.asarray(deriv_vec, int)
-        deriv_order = np.sum(deriv_vec)
-
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
-        if self.mode == 'core':
-            V = libint_interface.potential_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(V).reshape(self.nbf,self.nbf)
-        else:
-            idx = get_deriv_vec_idx(deriv_vec)
-            if os.path.exists("oei_derivs.h5"):
-                file_name = "oei_derivs.h5"
-                dataset_name = "potential_deriv" + str(deriv_order)
-            elif os.path.exists("oei_partials.h5"):
-                file_name = "oei_partials.h5"
-                dataset_name = "potential_deriv" + str(deriv_order) + "_" + str(idx)
-            else:
-                raise Exception("Something went wrong reading integral derivative file")
-            with h5py.File(file_name, 'r') as f:
-                data_set = f[dataset_name]
-                if len(data_set.shape) == 3:
-                    V = data_set[:,:,idx]
-                elif len(data_set.shape) == 2:
-                    V = data_set[:,:]
-                else:
-                    raise Exception("Something went wrong reading integral derivative file")
-            return jnp.asarray(V)
-
-    def overlap_jvp(self, primals, tangents):
-        geom, = primals
-        primals_out = self.overlap(geom) 
-        tangents_out = self.overlap_deriv(geom, tangents[0])
-        return primals_out, tangents_out
-    
-    def overlap_deriv_jvp(self, primals, tangents):
-        geom, deriv_vec = primals
-        primals_out = self.overlap_deriv(geom, deriv_vec)
-        tangents_out = self.overlap_deriv(geom, deriv_vec + tangents[0])
-        return primals_out, tangents_out
-    
-    def kinetic_jvp(self, primals, tangents):
-        geom, = primals
-        primals_out = self.kinetic(geom) 
-        tangents_out = self.kinetic_deriv(geom, tangents[0])
-        return primals_out, tangents_out
-    
-    def kinetic_deriv_jvp(self, primals, tangents):
-        geom, deriv_vec = primals
-        primals_out = self.kinetic_deriv(geom, deriv_vec)
-        tangents_out = self.kinetic_deriv(geom, deriv_vec + tangents[0])
-        return primals_out, tangents_out
-    
-    def potential_jvp(self, primals, tangents):
-        geom, = primals
-        primals_out = self.potential(geom) 
-        tangents_out = self.potential_deriv(geom, tangents[0])
-        return primals_out, tangents_out
-    
-    def potential_deriv_jvp(self, primals, tangents):
-        geom, deriv_vec = primals
-        primals_out = self.potential_deriv(geom, deriv_vec)
-        tangents_out = self.potential_deriv(geom, deriv_vec + tangents[0])
-        return primals_out, tangents_out
-
-    # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP's
-    # of each oei function
-    def overlap_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 2d slice, gather up a (ncart, n,n) array, 
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis 
-        # is in the 0th position (return results, 0)
-        geom_batch, deriv_batch = batched_args
-        geom_dim, deriv_dim = batch_dims
-        results = []
-        for i in deriv_batch:
-            tmp = self.overlap_deriv(geom_batch, i)
-            results.append(jnp.expand_dims(tmp, axis=0))
-        results = jnp.concatenate(results, axis=0)
-        return results, 0
-    
-    def kinetic_deriv_batch(self, batched_args, batch_dims):
-        geom_batch, deriv_batch = batched_args
-        geom_dim, deriv_dim = batch_dims
-        results = []
-        for i in deriv_batch:
-            tmp = self.kinetic_deriv(geom_batch, i)
-            results.append(jnp.expand_dims(tmp, axis=0))
-        results = jnp.concatenate(results, axis=0)
-        return results, 0
-    
-    def potential_deriv_batch(self, batched_args, batch_dims):
-        geom_batch, deriv_batch = batched_args
-        geom_dim, deriv_dim = batch_dims
-        results = []
-        for i in deriv_batch:
-            tmp = self.potential_deriv(geom_batch, i)
-            results.append(jnp.expand_dims(tmp, axis=0))
-        results = jnp.concatenate(results, axis=0)
-        return results, 0
-
-
-
-
diff --git a/quax/external_integrals/tei.py b/quax/external_integrals/tei.py
deleted file mode 100644
index 002b421..0000000
--- a/quax/external_integrals/tei.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import jax 
-import jax.numpy as jnp
-import numpy as np
-import h5py
-import os
-import psi4
-from . import libint_interface
-from ..utils import get_deriv_vec_idx, how_many_derivs
-
-jax.config.update("jax_enable_x64", True)
-
-class TEI(object):
-
-    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
-        with open(xyz_path, 'r') as f:  
-            tmp = f.read()              
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')                          
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)   
-        natoms = molecule.natom()                                                       
-        nbf = basis_set.nbf()                                                           
-
-        if mode == 'core' and max_deriv_order > 0:
-            # An list of ERI derivative tensors, containing only unique elements
-            # corresponding to upper hypertriangle (since derivative tensors are symmetric)
-            # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf,nbf,nbf)
-            # Then when JAX calls JVP, read appropriate slice 
-            self.eri_derivatives = []
-            for i in range(max_deriv_order):
-                n_unique_derivs = how_many_derivs(natoms, i + 1)
-                eri_deriv = libint_interface.eri_deriv_core(i+1).reshape(n_unique_derivs,nbf,nbf,nbf,nbf)
-                self.eri_derivatives.append(eri_deriv)
-
-        self.mode = mode
-        self.nbf = nbf
-
-        # Create new JAX primitive for TEI evaluation
-        self.tei_p = jax.core.Primitive("tei")
-        self.tei_deriv_p = jax.core.Primitive("tei_deriv")
-
-        # Register primitive evaluation rules
-        self.tei_p.def_impl(self.tei_impl)
-        self.tei_deriv_p.def_impl(self.tei_deriv_impl)
-
-        # Register the JVP rules with JAX
-        jax.interpreters.ad.primitive_jvps[self.tei_p] = self.tei_jvp
-        jax.interpreters.ad.primitive_jvps[self.tei_deriv_p] = self.tei_deriv_jvp
-
-        # Register tei_deriv batching rule with JAX
-        jax.interpreters.batching.primitive_batchers[self.tei_deriv_p] = self.tei_deriv_batch
-
-    # Create functions to call primitives
-    def tei(self, geom):
-        return self.tei_p.bind(geom)
-
-    def tei_deriv(self, geom, deriv_vec):
-        return self.tei_deriv_p.bind(geom, deriv_vec) 
-
-    # Create primitive evaluation rules
-    def tei_impl(self, geom):
-        G = libint_interface.eri()
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        G = G.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
-        return jnp.asarray(G)
-
-    def tei_deriv_impl(self, geom, deriv_vec):
-        deriv_vec = np.asarray(deriv_vec, int)
-        deriv_order = np.sum(deriv_vec)
-        idx = get_deriv_vec_idx(deriv_vec)
-
-        # Use eri derivatives in memory 
-        if self.mode == 'core':
-            G = self.eri_derivatives[deriv_order-1][idx,:,:,:,:]
-            return jnp.asarray(G)
-
-        # Read from disk
-        elif self.mode == 'disk':
-            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("eri_derivs.h5"):
-                file_name = "eri_derivs.h5"
-                dataset_name = "eri_deriv" + str(deriv_order)
-            # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("eri_partials.h5"):
-                file_name = "eri_partials.h5"
-                dataset_name = "eri_deriv" + str(deriv_order) + "_" + str(idx)
-            else:
-                raise Exception("ERI derivatives not found on disk")
-
-            with h5py.File(file_name, 'r') as f:
-                data_set = f[dataset_name]
-                if len(data_set.shape) == 5:
-                    G = data_set[:,:,:,:,idx]
-                elif len(data_set.shape) == 4:
-                    G = data_set[:,:,:,:]
-                else:
-                    raise Exception("Something went wrong reading integral derivative file")
-            return jnp.asarray(G)
-
-
-    # Create Jacobian-vector product rule, which given some input args (primals)
-    # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
-    # and the slice of the Jacobian (tangents_out)
-    def tei_jvp(self, primals, tangents):
-        geom, = primals
-        primals_out = self.tei(geom) 
-        tangents_out = self.tei_deriv(geom, tangents[0])
-        return primals_out, tangents_out
-
-    def tei_deriv_jvp(self, primals, tangents):
-        geom, deriv_vec = primals
-        primals_out = self.tei_deriv(geom, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector, 
-        # so that nested higher order differentiation works
-        tangents_out = self.tei_deriv(geom, deriv_vec + tangents[0])
-        return primals_out, tangents_out
-
-    # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP of tei
-    def tei_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array, 
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis 
-        # is in the 0th position (return results, 0)
-        geom_batch, deriv_batch = batched_args
-        geom_dim, deriv_dim = batch_dims
-        results = []
-        for i in deriv_batch:
-            tmp = self.tei_deriv(geom_batch, i)
-            results.append(jnp.expand_dims(tmp, axis=0))
-        results = jnp.concatenate(results, axis=0)
-        return results, 0
-
diff --git a/quax/external_integrals/tmp_potential.py b/quax/external_integrals/tmp_potential.py
deleted file mode 100644
index 371dd5e..0000000
--- a/quax/external_integrals/tmp_potential.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# Temporary potential integrals since libint does allow beyond 2nd order at the moment.
-import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
-import jax.numpy as jnp
-from jax.lax import fori_loop, while_loop
-
-from ..integrals.integrals_utils import boys, binomial_prefactor, gaussian_product, boys, factorials, double_factorials, neg_one_pow, cartesian_product, am_leading_indices, angular_momentum_combinations
-from ..integrals.basis_utils import flatten_basis_data, get_nbf
-
-def A_array(l1,l2,PA,PB,CP,g,A_vals):
-
-    def loop_i(arr0):
-       i_0, r_0, u_0, A_0 = arr0
-       Aterm_0 = neg_one_pow[i_0] * binomial_prefactor(i_0,l1,l2,PA,PB) * factorials[i_0]
-       r_0 = i_0 // 2
-
-       def loop_r(arr1):
-          i_1, r_1, u_1, Aterm_1, A_1 = arr1
-          u_1 = (i_1 - 2 * r_1) // 2
-
-          def loop_u(arr2):
-             i_2, r_2, u_2, Aterm_2, A_2 = arr2
-             I = i_2 - 2 * r_2 - u_2
-             tmp = I - u_2
-             fact_ratio = 1 / (factorials[r_2] * factorials[u_2] * factorials[tmp])
-             Aterm_2 *= neg_one_pow[u_2]  * CP[tmp] * (0.25 / g)**(r_2+u_2) * fact_ratio
-             A_2 = A_2.at[I].set(Aterm_2)
-             u_2 -= 1
-             return (i_2, r_2, u_2, Aterm_2, A_2)
-
-          i_1_, r_1_, u_1_, Aterm_1_, A_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i_1, r_1, u_1, Aterm_1, A_1))
-          r_1_ -= 1
-          return (i_1_, r_1_, u_1_, Aterm_1_, A_1_)
-
-       i_0_, r_0_, u_0_, Aterm_0_, A_0_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i_0, r_0, u_0, Aterm_0, A_0))
-       i_0_ -= 1
-       return (i_0_, r_0_, u_0_, A_0_)
-
-    i, r, u, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
-
-    return A
-
-@jax.jit
-def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals):
-    """
-    Computes a single electron-nuclear attraction integral primitive
-    """
-    gamma = aa + bb
-    prefactor *= -2 * jnp.pi / gamma
-
-    def loop_val(n, val):
-      Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[n,0,:],gamma,A_vals)
-      Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[n,1,:],gamma,A_vals)
-      Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[n,2,:],gamma,A_vals)
-
-      I, J, K, total = 0, 0, 0, 0
-      def loop_I(arr0):
-         I_0, J_0, K_0, val_0, total_0 = arr0
-         J_0 = 0
-
-         def loop_J(arr1):
-            I_1, J_1, K_1, val_1, total_1 = arr1
-            K_1 = 0
-
-            def loop_K(arr2):
-               I_2, J_2, K_2, val_2, total_2 = arr2
-               total_2 += Ax[I_2] * Ay[J_2] * Az[K_2] * boys_eval[I_2 + J_2 + K_2, n]
-               K_2 += 1
-               return (I_2, J_2, K_2, val_2, total_2)
-
-            I_1_, J_1_, K_1_, val_1_, total_1_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I_1, J_1, K_1, val_1, total_1))
-            J_1_ += 1
-            return (I_1_, J_1_, K_1_, val_1_, total_1_)
-
-         I_0_, J_0_, K_0_, val_0_, total_0_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I_0, J_0, K_0, val_0, total_0))
-         I_0_ += 1
-         return (I_0_, J_0_, K_0_, val_0_, total_0_)
-
-      I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
-      val_ += charges[n] * prefactor * total_
-      return val_
-
-    val = fori_loop(0, Pgeom_pow.shape[0], loop_val, 0)
-    return val
-
-def tmp_potential(geom, basis, charges):
-    """
-    Build potential one-electron integrals array
-    """
-    coeffs, exps, atoms, ams, indices, dims = flatten_basis_data(basis)
-    nbf = get_nbf(basis)
-    nprim = coeffs.shape[0]
-    max_am = jnp.max(ams)
-    A_vals = jnp.zeros(2*max_am+1)
-
-    # Save various AM distributions for indexing
-    # Obtain all possible primitive duet index combinations 
-    primitive_duets = cartesian_product(jnp.arange(nprim), jnp.arange(nprim))
-    V = jnp.zeros((nbf,nbf))
-
-    for n in range(primitive_duets.shape[0]):
-       p1,p2 = primitive_duets[n]
-       coef = coeffs[p1] * coeffs[p2]
-       aa, bb = exps[p1], exps[p2]
-       atom1, atom2 = atoms[p1], atoms[p2]
-       am1, am2 = ams[p1], ams[p2]
-       A, B = geom[atom1], geom[atom2]
-       ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
-
-       gamma = aa + bb
-       prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
-       P = (aa * A + bb * B) / gamma
-       # Maximum angular momentum: hard coded
-       # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi.
-       # We need 2+max_am since kinetic requires incrementing angluar momentum by +2
-       PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+3,3)).T, jnp.arange(max_am+3))
-       PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+3,3)).T, jnp.arange(max_am+3))
-
-       # For potential integrals, we need the difference between
-       # the gaussian product center P and ALL atoms in the molecule,
-       # and then take all possible powers up to 2*max_am.
-       # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
-       # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
-       P_minus_geom = jnp.broadcast_to(P, geom.shape) - geom
-       Pgeom_pow = jnp.power(jnp.transpose(jnp.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), jnp.arange(2*max_am + 1))
-       # All possible jnp.dot(P-atom,P-atom)
-       rcp2 = jnp.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
-       # All needed (and unneeded, for am < max_am) boys function evaluations
-       boys_arg = jnp.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
-       boys_nu = jnp.tile(jnp.arange(2*max_am+1), (geom.shape[0],1)).T
-       boys_eval = boys(boys_nu,boys_arg)
-
-       a, b = 0, 0
-       def loop_a(arr0):
-          a_0, b_0, oei_0 = arr0
-          b_0 = 0
-
-          def loop_b(arr1):
-             a_1, b_1, oei_1 = arr1
-             # Gather angular momentum and index
-             la,ma,na = angular_momentum_combinations[a_1 + ld1]
-             lb,mb,nb = angular_momentum_combinations[b_1 + ld2]
-             # To only create unique indices, need to have separate indices arrays for i and j.
-             i = indices[p1] + a_1
-             j = indices[p2] + b_1
-             # Compute one electron integrals and add to appropriate index
-             potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-             oei_1 = oei_1.at[i,j].set(potential_int)
-             b_1 += 1
-             return (a_1, b_1, oei_1)
-
-          a_0_, b_0_, oei_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, oei_0))
-          a_0_ += 1
-          return (a_0_, b_0_, oei_0_)
-
-       a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, V))
-
-       return oei_
-
-    return V
diff --git a/quax/integrals/__init__.py b/quax/integrals/__init__.py
index ec32fb3..68a2ded 100644
--- a/quax/integrals/__init__.py
+++ b/quax/integrals/__init__.py
@@ -1,5 +1,7 @@
-from . import basis_utils
-from . import integrals_utils
-from . import oei
 from . import tei
+from . import oei
+from . import libint_interface
+
+from .tei import TEI 
+from .oei import OEI 
 
diff --git a/quax/external_integrals/buffer_lookups.h b/quax/integrals/buffer_lookups.h
similarity index 100%
rename from quax/external_integrals/buffer_lookups.h
rename to quax/integrals/buffer_lookups.h
diff --git a/quax/integrals/integrals_utils.py b/quax/integrals/integrals_utils.py
deleted file mode 100644
index fd0545c..0000000
--- a/quax/integrals/integrals_utils.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import jax
-from jax.config import config; config.update("jax_enable_x64", True)
-import jax.numpy as jnp
-import numpy as np
-from functools import partial
-from jax.lax import while_loop
-
-def boys(m,x,eps=1e-12):
-    return 0.5 * (x + eps)**(-(m + 0.5)) * jax.lax.igamma(m + 0.5, x + eps) \
-           * jnp.exp(jax.lax.lgamma(m + 0.5))
-
-def binomial_prefactor(k, l1, l2, PAx, PBx):
-    """
-    Function to binomial prefactor, commonly denoted f_k()
-    Fermann, Valeev 2.46
-    Similar equivalent form in eqn 15 Augsberger Dykstra 1989 J Comp Chem 11 105-111
-    PAx, PBx are all vectors of components Pi-Ai, Pi-Bi raised to a power of angluar momentum.
-    PAx = [PAx^0, PAx^1,...,PAx^max_am
-    """
-    q = jax.lax.max(-k, k-2*l2)
-    q_final = jax.lax.min(k, 2*l1-k)
-
-    def loop_q(arr):
-       q_n, total = arr
-       i = (k+q_n)//2
-       j = (k-q_n)//2
-       return (q_n+2, total + PAx[l1-i] * PBx[l2-j] * binomials[l1,i] * binomials[l2,j])
-
-    q_, total_sum = while_loop(lambda arr: arr[0] < q_final, loop_q, (q,0))
-
-    return total_sum
-
-def gaussian_product(alpha1,A,alpha2,B):
-    '''Gaussian product theorem. Returns center.'''
-    return (alpha1*A+alpha2*B)/(alpha1+alpha2)
- 
-def find_unique_shells(nshells):
-    '''Find shell quartets which correspond to corresponding to unique two-electron integrals, i>=j, k>=l, IJ>=KL'''
-    v = np.arange(nshells,dtype=jnp.int16) 
-    indices = cartesian_product(v,v,v,v)
-    cond1 = (indices[:,0] >= indices[:,1]) & (indices[:,2] >= indices[:,3]) 
-    cond2 = indices[:,0] * (indices[:,0] + 1)/2 + indices[:,1] >= indices[:,2] * (indices[:,2] + 1)/2 + indices[:,3]
-    mask = cond1 & cond2 
-    return jnp.asarray(indices[mask,:])
-
-def cartesian_product(*arrays):
-    '''JAX-friendly version of cartesian product. Same order as other function, more memory requirements though.'''
-    tmp = jnp.asarray(jnp.meshgrid(*arrays, indexing='ij')).reshape(len(arrays),-1).T
-    return jnp.asarray(tmp)
-
-def am_vectors(am, length=3):
-    '''
-    Builds up all possible angular momentum component vectors of with total angular momentum 'am'
-    am = 2 ---> [(2, 0, 0), (1, 1, 0), (1, 0, 1), (0, 2, 0), (0, 1, 1), (0, 0, 2)]
-    Returns a generator which must be converted to an iterable,
-    for example, call the following: [list(i) for i in am_vectors(2)]
-
-    Works by building up each possibility :
-    For a given value in reversed(range(am+1)), find all other possible values for other entries in length 3 vector
-     value     am_vectors(am-value,length-1)    (value,) + permutation
-       2 --->         [0,0]                 ---> [2,0,0] ---> dxx
-       1 --->         [1,0]                 ---> [1,1,0] ---> dxy
-         --->         [0,1]                 ---> [1,0,1] ---> dxz
-       0 --->         [2,0]                 ---> [0,2,0] ---> dyy
-         --->         [1,1]                 ---> [0,1,1] ---> dyz
-         --->         [0,2]                 ---> [0,0,2] ---> dzz
-    '''
-    if length == 1:
-        yield (am,)
-    else:
-        # reverse so angular momentum order is canonical, e.g., dxx dxy dxz dyy dyz dzz
-        for value in reversed(range(am + 1)):
-            for permutation in am_vectors(am - value,length - 1):
-                yield (value,) + permutation
-
-# Need to store factorials up to l1 + l2 + l3 + l4 + 1
-# support for h functions requires up to 21!, we add a one more to be safe 
-factorials = jnp.array([1.0000000000000000e0, 1.0000000000000000e0, 2.0000000000000000e0,
-                       6.0000000000000000e0, 2.4000000000000000e1, 1.2000000000000000e2,
-                       7.2000000000000000e2, 5.0400000000000000e3, 4.0320000000000000e4,
-                       3.6288000000000000e5, 3.6288000000000000e6, 3.9916800000000000e7,
-                       4.7900160000000000e8, 6.2270208000000000e9, 8.7178291200000000e10,
-                       1.3076743680000000e12,2.0922789888000000e13,3.5568742809600000e14,
-                       6.4023737057280000e15,1.2164510040883200e17,2.4329020081766400e18],dtype=int)
-                       #6.4023737057280000e15,1.2164510040883200e17,2.4329020081766400e18,
-                       #5.1090942171709440e19,1.1240007277776077e21,2.5852016738884978e22,
-                       #6.2044840173323941e23,1.5511210043330986e25,4.0329146112660565e26],dtype=int)
-
-
-# Double factorials for overlap/kinetic. 
-# We need 0!! to (l1+l2+1+2)!! (the plus 2 is for kinetic components) 
-# but sometimes we index -1, so put a 1 at the end.
-double_factorials = jnp.array([1,1,2,3,8,15,48,105,384,945,3840,10395,46080,135135,645120,2027025,10321920,1],dtype=int)
- 
-# All elements for a,b in which satisfy a! / (b! (a-2b)!)
-# factorial(a) / factorial(b) / factorial(a-2*b)
-# Must support up to L = l1 + l2 + l3 + l4 on row dimension, L/2 col dimension 
-fact_ratio2 = jnp.array([[1,  0,     0,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1,  0,     0,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1,  2,     0,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1,  6,     0,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 12,    12,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 20,    60,       0,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 30,   180,     120,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 42,   420,     840,         0,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 56,   840,    3360,      1680,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 72,  1512,   10080,     15120,          0,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1, 90,  2520,   25200,     75600,      30240,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1,110,  3960,   55440,    277200,     332640,            0,             0,               0,               0,                0,                0,                 0,0],
-                        [1,132,  5940,  110880,    831600,    1995840,       665280,             0,               0,               0,                0,                0,                 0,0],
-                        [1,156,  8580,  205920,   2162160,    8648640,      8648640,             0,               0,               0,                0,                0,                 0,0],
-                        [1,182, 12012,  360360,   5045040,   30270240,     60540480,      17297280,               0,               0,                0,                0,                 0,0],
-                        [1,210, 16380,  600600,  10810800,   90810720,    302702400,     259459200,               0,               0,                0,                0,                 0,0],
-                        [1,240, 21840,  960960,  21621600,  242161920,   1210809600,    2075673600,       518918400,               0,                0,                0,                 0,0],
-                        [1,272, 28560, 1485120,  40840800,  588107520,   4116752640,   11762150400,      8821612800,               0,                0,                0,                 0,0],
-                        [1,306, 36720, 2227680,  73513440, 1323241920,  12350257920,   52929676800,     79394515200,     17643225600,                0,                0,                 0,0],
-                        [1,342, 46512, 3255840, 126977760, 2793510720,  33522128640,  201132771840,    502831929600,    335221286400,                0,                0,                 0,0],
-                        [1,380, 58140, 4651200, 211629600, 5587021440,  83805321600,  670442572800,   2514159648000,   3352212864000,     670442572800,                0,                 0,0],
-                        [1,420, 71820, 6511680, 341863200,10666131840, 195545750400, 2011327718400,  10559470521600,  23465490048000,   14079294028800,                0,                 0,0],
-                        [1,462, 87780, 8953560, 537213600,19554575040, 430200650880, 5531151225600,  38718058579200, 129060195264000,  154872234316800,   28158588057600,                 0,0],
-                        [1,506,106260,12113640, 823727520,34596555840, 899510451840,14135164243200, 127216478188800, 593676898214400, 1187353796428800,  647647525324800,                 0,0],
-                        [1,552,127512,16151519,1235591279,59308381439,1799020903680,33924394183680, 381649434566400,2374707592857600, 7124122778572800, 7771770303897600,  1295295050649600,0],
-                        [1,600,151800,21252000,1817046000,98847302400,3459655584000,77100895872000,1060137318240000,8481098545920000,35620613892864000,64764752532480000, 32382376266240000,0]],dtype=int)
-
-# Binomial Coefficients
-# C = factorial(n) // (factorial(k) * factorial(n-k))
-# Minimum required dimension is (max_am * 2, max_am)
-binomials = jnp.array([[1, 1,  0,  0,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0], 
-                      [1, 1,  0,  0,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 2,  1,  0,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 3,  3,  1,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 4,  6,  4,   1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 5, 10, 10,   5,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 6, 15, 20,  15,    6,    1,    0,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 7, 21, 35,  35,   21,    7,    1,    0,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 8, 28, 56,  70,   56,   28,    8,    1,    0,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1, 9, 36, 84, 126,  126,   84,   36,    9,    1,    0,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1,10, 45,120, 210,  252,  210,  120,   45,   10,    1,    0,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1,11, 55,165, 330,  462,  462,  330,  165,   55,   11,    1,    0,    0,    0,   0,  0,  0, 0,0],
-                      [1,12, 66,220, 495,  792,  924,  792,  495,  220,   66,   12,    1,    0,    0,   0,  0,  0, 0,0],
-                      [1,13, 78,286, 715, 1287, 1716, 1716, 1287,  715,  286,   78,   13,    1,    0,   0,  0,  0, 0,0],
-                      [1,14, 91,364,1001, 2002, 3003, 3432, 3003, 2002, 1001,  364,   91,   14,    1,   0,  0,  0, 0,0],
-                      [1,15,105,455,1365, 3003, 5005, 6435, 6435, 5005, 3003, 1365,  455,  105,   15,   1,  0,  0, 0,0],
-                      [1,16,120,560,1820, 4368, 8008,11440,12870,11440, 8008, 4368, 1820,  560,  120,  16,  1,  0, 0,0],
-                      [1,17,136,680,2380, 6188,12376,19448,24310,24310,19448,12376, 6188, 2380,  680, 136, 17,  1, 0,0],
-                      [1,18,153,816,3060, 8568,18564,31824,43758,48620,43758,31824,18564, 8568, 3060, 816,153, 18, 1,0],
-                      [1,19,171,969,3876,11628,27132,50388,75582,92378,92378,75582,50388,27132,11628,3876,969,171,19,1]], dtype=int)
-
-# Angular momentum distribution combinations, up to max_am=5, (h functions)
-angular_momentum_combinations = jnp.array([
-[0,0,0], 
-[1,0,0],[0,1,0],[0,0,1],
-[2,0,0],[1,1,0],[1,0,1],[0,2,0],[0,1,1],[0,0,2], 
-[3,0,0],[2,1,0],[2,0,1],[1,2,0],[1,1,1],[1,0,2],[0,3,0],[0,2,1],[0,1,2],[0,0,3], 
-[4,0,0],[3,1,0],[3,0,1],[2,2,0],[2,1,1],[2,0,2],[1,3,0],[1,2,1],[1,1,2],[1,0,3],[0,4,0],[0,3,1],[0,2,2],[0,1,3],[0,0,4], 
-[5,0,0],[4,1,0],[4,0,1],[3,2,0],[3,1,1],[3,0,2],[2,3,0],[2,2,1],[2,1,2],[2,0,3],[1,4,0],[1,3,1],[1,2,2],[1,1,3],[1,0,4],[0,5,0],[0,4,1],[0,3,2],[0,2,3],[0,1,4],[0,0,5]], dtype=int)
-
-# The first index of angular_momentum_combinations which corresponds to beginning of s-class, p-class, d-class, f-class, g-class, h-class
-am_leading_indices = jnp.array([0,1,4,10,20,35,56], dtype=int)
-
-# Powers of negative one, need indices up to l1 + l2 + l3 + l4 = 20 for h functions
-neg_one_pow = jnp.array([1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1])
-
diff --git a/quax/external_integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
similarity index 88%
rename from quax/external_integrals/libint_interface.cc
rename to quax/integrals/libint_interface.cc
index 84fe78c..e306c7e 100644
--- a/quax/external_integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -155,15 +155,15 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
 // Compute overlap integrals
 py::array overlap() {
     // Overlap integral engine
-    libint2::Engine s_engine(libint2::Operator::overlap,obs.max_nprim(),obs.max_l());
+    libint2::Engine s_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l());
     const auto& buf_vec = s_engine.results(); // will point to computed shell sets
     size_t length = nbf * nbf;
     std::vector<double> result(length); // vector to store integral array
 
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto n1 = obs[s1].size(); // number of basis functions in first shell
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
@@ -172,8 +172,8 @@ py::array overlap() {
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
             // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     result[(bf1 + f1) * nbf + bf2 + f2] = ints_shellset[idx];
                 }
             }
@@ -185,15 +185,15 @@ py::array overlap() {
 // Compute kinetic energy integrals
 py::array kinetic() {
     // Kinetic energy integral engine
-    libint2::Engine t_engine(libint2::Operator::kinetic,obs.max_nprim(),obs.max_l());
+    libint2::Engine t_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l());
     const auto& buf_vec = t_engine.results(); // will point to computed shell sets
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto n1 = obs[s1].size(); // number of basis functions in first shell
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
@@ -202,8 +202,8 @@ py::array kinetic() {
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
             // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     result[ (bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
                 }
             }
@@ -215,17 +215,17 @@ py::array kinetic() {
 // Compute nuclear-electron potential energy integrals
 py::array potential() {
     // Potential integral engine
-    libint2::Engine v_engine(libint2::Operator::nuclear,obs.max_nprim(),obs.max_l());
+    libint2::Engine v_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l());
     v_engine.set_params(make_point_charges(atoms));
     const auto& buf_vec = v_engine.results(); // will point to computed shell sets
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
     
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto n1 = obs[s1].size(); // number of basis functions in first shell
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
@@ -234,8 +234,8 @@ py::array potential() {
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
             // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     // idx = x + (y * width) where x = bf2 + f2 and y = bf1 + f1 
                     result[ (bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
                 }
@@ -248,22 +248,22 @@ py::array potential() {
 // Computes electron repulsion integrals
 py::array eri() {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    libint2::Engine eri_engine(libint2::Operator::coulomb,obs.max_nprim(),obs.max_l());
+    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l());
     const auto& buf_vec = eri_engine.results(); // will point to computed shell sets
 
     size_t length = nbf * nbf * nbf * nbf;
     std::vector<double> result(length);
     
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto n1 = obs[s1].size(); // number of basis functions in first shell
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
-            for(auto s3=0; s3!=obs.size(); ++s3) {
+            for(auto s3=0; s3 != obs.size(); ++s3) {
                 auto bf3 = shell2bf[s3];  // first basis function in third shell
                 auto n3 = obs[s3].size(); // number of basis functions in third shell
-                for(auto s4=0; s4!=obs.size(); ++s4) {
+                for(auto s4 = 0; s4 != obs.size(); ++s4) {
                     auto bf4 = shell2bf[s4];  // first basis function in fourth shell
                     auto n4 = obs[s4].size(); // number of basis functions in fourth shell
 
@@ -272,13 +272,13 @@ py::array eri() {
                     if (ints_shellset == nullptr)
                         continue;  // nullptr returned if the entire shell-set was screened out
                     // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1=0, idx=0; f1!=n1; ++f1) {
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
-                        for(auto f2=0; f2!=n2; ++f2) {
+                        for(auto f2 = 0; f2 != n2; ++f2) {
                             size_t offset_2 = (bf2 + f2) * nbf * nbf;
-                            for(auto f3=0; f3!=n3; ++f3) {
+                            for(auto f3 = 0; f3 != n3; ++f3) {
                                 size_t offset_3 = (bf3 + f3) * nbf;
-                                for(auto f4=0; f4!=n4; ++f4, ++idx) {
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                     result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
                                 }
                             }
@@ -303,7 +303,7 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Overlap integral derivative engine
-    libint2::Engine s_engine(libint2::Operator::overlap,obs.max_nprim(),obs.max_l(),deriv_order);
+    libint2::Engine s_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
 
     // Get size of overlap derivative array and allocate 
     size_t length = nbf * nbf;
@@ -311,11 +311,11 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 
     const auto& buf_vec = s_engine.results(); // will point to computed shell sets
     
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2=0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -323,12 +323,12 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             if (atom1 == atom2) continue;
 
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1,atom2};
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
             // We can check if EVERY differentiated atom according to deriv_vec is contained in this set of 2 atom indices
             // This will ensure the derivative we want is in the buffer.
             std::vector<int> desired_shell_atoms; 
-            for (int i=0; i < deriv_order; i++){
+            for (int i = 0; i < deriv_order; i++){
                 int desired_atom = desired_atom_indices[i];
                 if (shell_atom_index_list[0] == desired_atom) desired_shell_atoms.push_back(0); 
                 else if (shell_atom_index_list[1] == desired_atom) desired_shell_atoms.push_back(1); 
@@ -342,7 +342,7 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
             std::vector<int> shell_derivative;
-            for (int i=0; i < deriv_order; i++){
+            for (int i = 0; i < deriv_order; i++){
                 shell_derivative.push_back(3 * desired_shell_atoms[i] + desired_coordinates[i]);
             }
 
@@ -367,8 +367,8 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
                 continue;  // nullptr returned if the entire shell-set was screened out
 
             // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     result[(bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
                 }
             }
@@ -389,17 +389,17 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Kinetic integral derivative engine
-    libint2::Engine t_engine(libint2::Operator::kinetic,obs.max_nprim(),obs.max_l(),deriv_order);
+    libint2::Engine t_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
     const auto& buf_vec = t_engine.results(); // will point to computed shell sets
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
     
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -407,12 +407,12 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             if (atom1 == atom2) continue;
 
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1,atom2};
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
             // We can check if EVERY differentiated atom according to deriv_vec is contained in this set of 2 atom indices
             // This will ensure the derivative we want is in the buffer.
             std::vector<int> desired_shell_atoms; 
-            for (int i=0; i < deriv_order; i++){
+            for (int i = 0; i < deriv_order; i++){
                 int desired_atom = desired_atom_indices[i];
                 if (shell_atom_index_list[0] == desired_atom) desired_shell_atoms.push_back(0); 
                 else if (shell_atom_index_list[1] == desired_atom) desired_shell_atoms.push_back(1); 
@@ -426,7 +426,7 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
             std::vector<int> shell_derivative;
-            for (int i=0; i < deriv_order; i++){
+            for (int i = 0; i < deriv_order; i++){
                 shell_derivative.push_back(3 * desired_shell_atoms[i] + desired_coordinates[i]);
             }
 
@@ -451,8 +451,8 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
                 continue;  // nullptr returned if the entire shell-set was screened out
 
             // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     result[(bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
                 }
             }
@@ -468,10 +468,8 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
     // Lookup arrays for mapping shell derivative index to buffer index 
-    // Potential derivatives are weird. The dimension size is 6 + ncart + ncart 
-    // I believe only the first 6 and last ncart are relevent. Idk what is with the ghost dimension 
     // Potential lookup arrays depend on atom size
-    int dimensions = 6 + 2 * 3 * atoms.size();
+    int dimensions = 6 + 3 * atoms.size();
     static const std::vector<int> buffer_index_potential1d = generate_1d_lookup(dimensions);
     static const std::vector<std::vector<int>> buffer_index_potential2d = generate_2d_lookup(dimensions);
     static const std::vector<std::vector<std::vector<int>>> buffer_index_potential3d = generate_3d_lookup(dimensions);
@@ -483,7 +481,7 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Potential integral derivative engine
-    libint2::Engine v_engine(libint2::Operator::nuclear,obs.max_nprim(),obs.max_l(),deriv_order);
+    libint2::Engine v_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
     v_engine.set_params(libint2::make_point_charges(atoms));
     const auto& buf_vec = v_engine.results(); // will point to computed shell sets
 
@@ -491,47 +489,45 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
 
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1,atom2};
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
             // Initialize 2d vector, with DERIV_ORDER subvectors
             // Each subvector contains index candidates which are possible choices for each partial derivative operator
             // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
             // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
             std::vector<std::vector<int>> indices; 
-            for (int i=0;i<deriv_order; i++){
+            for (int i = 0; i < deriv_order; i++){
                 std::vector<int> new_vec;
                 indices.push_back(new_vec);
             }
 
             // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
             // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-            for (int j=0; j < desired_atom_indices.size(); j++){
+            for (int j = 0; j < desired_atom_indices.size(); j++){
                 int desired_atom_idx = desired_atom_indices[j];
                 // Shell indices
-                for (int i=0; i<2; i++){
+                for (int i = 0; i < 2; i++){
                     int atom_idx = shell_atom_index_list[i];
                     if (atom_idx == desired_atom_idx) { 
                         int tmp = 3 * i + desired_coordinates[j];
                         indices[j].push_back(tmp);
                     }
                 }
-                // TODO weird action here by libint, theres a NCART block of zeros introduced between shell derivs and real NCART derivs
-                // So we compensate by starting from 2 + natom
-                // If this is ever changed, this needs to be edited.
-                for (int i=0; i<natom; i++){
+                
+                for (int i = 0; i < natom; i++){
                     // i = shell_atom_index_list[i];
                     if (i == desired_atom_idx) { 
-                        int offset_i = i + 2 + natom;
+                        int offset_i = i + 2;
                         int tmp = 3 * offset_i + desired_coordinates[j];
                         indices[j].push_back(tmp);
                     }
@@ -547,20 +543,20 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
             // Loop over every subvector of index_combos and lookup buffer index.
             std::vector<int> buffer_indices;
             if (deriv_order == 1){
-                for (int i=0; i < index_combos.size(); i++){
+                for (int i = 0; i < index_combos.size(); i++){
                     int idx1 = index_combos[i][0];
                     buffer_indices.push_back(buffer_index_potential1d[idx1]);
                 }
             }
             else if (deriv_order == 2){
-                for (int i=0; i < index_combos.size(); i++){
+                for (int i = 0; i < index_combos.size(); i++){
                     int idx1 = index_combos[i][0];
                     int idx2 = index_combos[i][1];
                     buffer_indices.push_back(buffer_index_potential2d[idx1][idx2]);
                 }
             }
             else if (deriv_order == 3){
-                for (int i=0; i < index_combos.size(); i++){
+                for (int i = 0; i < index_combos.size(); i++){
                     int idx1 = index_combos[i][0];
                     int idx2 = index_combos[i][1];
                     int idx3 = index_combos[i][2];
@@ -568,7 +564,7 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 }
             }
             else if (deriv_order == 4){
-                for (int i=0; i < index_combos.size(); i++){
+                for (int i = 0; i < index_combos.size(); i++){
                     int idx1 = index_combos[i][0];
                     int idx2 = index_combos[i][1];
                     int idx3 = index_combos[i][2];
@@ -578,11 +574,11 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
             }
 
             // Loop over every buffer index and accumulate for every shell set.
-            for(auto i=0; i<buffer_indices.size(); ++i) {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
               auto ints_shellset = buf_vec[buffer_indices[i]]; 
               if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-              for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                for(auto f2=0; f2!=n2; ++f2, ++idx) {
+              for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                   result[(bf1 + f1) * nbf + bf2 + f2] += ints_shellset[idx]; 
                 }
               }
@@ -604,24 +600,24 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
     assert(3 * atoms.size() == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
     // ERI derivative integral engine
-    libint2::Engine eri_engine(libint2::Operator::coulomb,obs.max_nprim(),obs.max_l(),deriv_order);
+    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
     const auto& buf_vec = eri_engine.results(); // will point to computed shell sets
     size_t length = nbf * nbf * nbf * nbf;
     std::vector<double> result(length);
 
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
-            for(auto s3=0; s3!=obs.size(); ++s3) {
+            for(auto s3 = 0; s3 != obs.size(); ++s3) {
                 auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
                 auto atom3 = shell2atom[s3]; // Atom index of shell 3
                 auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                for(auto s4=0; s4!=obs.size(); ++s4) {
+                for(auto s4 = 0; s4 != obs.size(); ++s4) {
                     auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                     auto atom4 = shell2atom[s4]; // Atom index of shell 4
                     auto n4 = obs[s4].size();    // number of basis functions in shell 4
@@ -630,7 +626,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                     // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
                     bool atoms_not_present = false;
-                    for (int i=0; i < deriv_order; i++){
+                    for (int i = 0; i < deriv_order; i++){
                         if (atom1 == desired_atom_indices[i]) continue; 
                         else if (atom2 == desired_atom_indices[i]) continue;
                         else if (atom3 == desired_atom_indices[i]) continue;
@@ -640,24 +636,24 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     if (atoms_not_present) continue;
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1,atom2,atom3,atom4};
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
                     // Initialize 2d vector, with DERIV_ORDER subvectors
                     // Each subvector contains index candidates which are possible choices for each partial derivative operator
                     // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
                     // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
                     std::vector<std::vector<int>> indices;
-                    for (int i=0;i<deriv_order; i++){
+                    for (int i = 0; i < deriv_order; i++){
                         std::vector<int> new_vec;
                         indices.push_back(new_vec);
                     }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    for (int j=0; j < desired_atom_indices.size(); j++){
+                    for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices
-                        for (int i=0; i<4; i++){
+                        for (int i = 0; i < 4; i++){
                             int atom_idx = shell_atom_index_list[i];
                             if (atom_idx == desired_atom_idx) {
                                 int tmp = 3 * i + desired_coordinates[j];
@@ -675,20 +671,20 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
                     if (deriv_order == 1){ 
-                        for (int i=0; i < index_combos.size(); i++){
+                        for (int i = 0; i < index_combos.size(); i++){
                             int idx1 = index_combos[i][0];
                             buffer_indices.push_back(buffer_index_eri1d[idx1]);
                         }
                     }
                     else if (deriv_order == 2){ 
-                        for (int i=0; i < index_combos.size(); i++){
+                        for (int i = 0; i < index_combos.size(); i++){
                             int idx1 = index_combos[i][0];
                             int idx2 = index_combos[i][1];
                             buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
                         }
                     }
                     else if (deriv_order == 3){ 
-                        for (int i=0; i < index_combos.size(); i++){
+                        for (int i = 0; i < index_combos.size(); i++){
                             int idx1 = index_combos[i][0];
                             int idx2 = index_combos[i][1];
                             int idx3 = index_combos[i][2];
@@ -696,7 +692,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                         }
                     }
                     else if (deriv_order == 4){ 
-                        for (int i=0; i < index_combos.size(); i++){
+                        for (int i = 0; i < index_combos.size(); i++){
                             int idx1 = index_combos[i][0];
                             int idx2 = index_combos[i][1];
                             int idx3 = index_combos[i][2];
@@ -708,16 +704,16 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
                     eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set, fills buf_vec
 
-                    for(auto i=0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i<buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1=0, idx=0; f1!=n1; ++f1) {
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                             size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
-                            for(auto f2=0; f2!=n2; ++f2) {
+                            for(auto f2 = 0; f2 != n2; ++f2) {
                                 size_t offset_2 = (bf2 + f2) * nbf * nbf;
-                                for(auto f3=0; f3!=n3; ++f3) {
+                                for(auto f3 = 0; f3 != n3; ++f3) {
                                     size_t offset_3 = (bf3 + f3) * nbf;
-                                    for(auto f4=0; f4!=n4; ++f4, ++idx) {
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                         result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
                                     }
                                 }
@@ -759,7 +755,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 void oei_deriv_disk(int max_deriv_order) {
     std::cout << "Writing one-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
     long total_deriv_slices = 0;
-    for (int i=1; i<= max_deriv_order; i++){
+    for (int i = 1; i <= max_deriv_order; i++){
         total_deriv_slices += how_many_derivs(natom, i);
         }
 
@@ -780,9 +776,8 @@ void oei_deriv_disk(int max_deriv_order) {
         // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
         // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
         const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-        // Potential integrals buffer is flattened upper triangle of (6 + NCART + NCART) dimensional deriv_order tensor
-        // TODO if libint ever fixes the erroneous NCART + NCART buffer dimension size, this needs to be changed (remove *2)
-        int dimensions = 6 + 2 * 3 * natom;
+        // Potential integrals buffer is the flattened upper triangle of a (6 + NCART)-dimensional deriv_order tensor: 6 = 2 shell centers x 3 coordinates, NCART = 3 * natom nuclear coordinates
+        int dimensions = 6 + 3 * natom;
         const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(dimensions, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
@@ -809,19 +804,18 @@ void oei_deriv_disk(int max_deriv_order) {
         DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[3] = {1,1,1}; // stride and block can be used to 
-        hsize_t block[3] = {1,1,1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[3] = {0,0,0};
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[3] = {0, 0, 0};
 
-        for(auto s1=0; s1!=obs.size(); ++s1) {
+        for(auto s1 = 0; s1 != obs.size(); ++s1) {
             auto bf1 = shell2bf[s1];  // first basis function in first shell
             auto atom1 = shell2atom[s1]; // Atom index of shell 1
             auto n1 = obs[s1].size(); // number of basis functions in first shell
-            for(auto s2=0; s2!=obs.size(); ++s2) {
+            for(auto s2 = 0; s2 != obs.size(); ++s2) {
                 auto bf2 = shell2bf[s2];  // first basis function in second shell
                 auto atom2 = shell2atom[s2]; // Atom index of shell 2
                 auto n2 = obs[s2].size(); // number of basis functions in second shell
-                //if (atom1 == atom2) continue;
                 std::vector<long> shell_atom_index_list{atom1,atom2};
 
                 overlap_engine.compute(obs[s1], obs[s2]);
@@ -835,7 +829,7 @@ void oei_deriv_disk(int max_deriv_order) {
                 
                 // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                 // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-                for(int nuc_idx=0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
                     // Look up multidimensional cartesian derivative index
                     auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
                     // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
@@ -846,7 +840,7 @@ void oei_deriv_disk(int max_deriv_order) {
                 
                     // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
                     // and check to see if it is present in the shell duet, and where it is present in the potential operator 
-                    for (int j=0; j < multi_cart_idx.size(); j++){
+                    for (int j = 0; j < multi_cart_idx.size(); j++){
                         int desired_atom_idx = multi_cart_idx[j] / 3;
                         int desired_coord = multi_cart_idx[j] % 3;
                         // Loop over shell indices
@@ -860,10 +854,9 @@ void oei_deriv_disk(int max_deriv_order) {
                         }
                         // Now for potentials only, loop over each atom in molecule, and if this derivative
                         // differentiates wrt that atom, we also need to collect that index.
-                        // If libint ever removes that extra NCART dimension, remove the `+ natom`
-                        for (int i=0; i<natom; i++){
+                        for (int i = 0; i < natom; i++){
                             if (i == desired_atom_idx) {
-                                int offset_i = i + 2 + natom;
+                                int offset_i = i + 2;
                                 int tmp = 3 * offset_i + desired_coord;
                                 potential_indices[j].push_back(tmp);
                             }
@@ -896,21 +889,21 @@ void oei_deriv_disk(int max_deriv_order) {
 
                     // Loop over shell block for each buffer index which contributes to this derivative
                     // Overlap and Kinetic
-                    for(auto i=0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto overlap_shellset = overlap_buffer[buffer_indices[i]];
                         auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                        for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                            for(auto f2=0; f2!=n2; ++f2, ++idx) {
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                                 overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
                                 kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
                             }
                         }
                     }
                     // Potential
-                    for(auto i=0; i<potential_buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
                         auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                        for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                            for(auto f2=0; f2!=n2; ++f2, ++idx) {
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                                 potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
                             }
                         }
@@ -963,7 +956,7 @@ void eri_deriv_disk(int max_deriv_order) {
 
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
-    for (int i=1; i<= max_deriv_order; i++){
+    for (int i = 1; i <= max_deriv_order; i++){
         total_deriv_slices += how_many_derivs(natom, i);
         }
     double check = (nbf * nbf * nbf * nbf * total_deriv_slices * 8) * (1e-9);
@@ -982,7 +975,7 @@ void eri_deriv_disk(int max_deriv_order) {
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(natom * 3, deriv_order);
 
         // Libint engine for computing shell quartet derivatives
-        libint2::Engine eri_engine(libint2::Operator::coulomb,obs.max_nprim(),obs.max_l(), deriv_order);
+        libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
         const auto& eri_buffer = eri_engine.results(); // will point to computed shell sets
 
         // Define HDF5 dataset name
@@ -991,47 +984,47 @@ void eri_deriv_disk(int max_deriv_order) {
         DataSpace fspace(5, file_dims);
         // Create dataset for each integral type and write 0.0's into the file 
         DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1,1,1,1,1}; // stride and block can be used to 
-        hsize_t block[5] = {1,1,1,1,1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0,0,0,0,0};
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
         // Begin shell quartet loops
-        for(auto s1=0; s1!=obs.size(); ++s1) {
+        for(auto s1 = 0; s1 != obs.size(); ++s1) {
             auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
             auto atom1 = shell2atom[s1]; // Atom index of shell 1
             auto n1 = obs[s1].size();    // number of basis functions in shell 1
-            for(auto s2=0; s2!=obs.size(); ++s2) {
+            for(auto s2 = 0; s2 != obs.size(); ++s2) {
                 auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
                 auto atom2 = shell2atom[s2]; // Atom index of shell 2
                 auto n2 = obs[s2].size();    // number of basis functions in shell 2
-                for(auto s3=0; s3!=obs.size(); ++s3) {
+                for(auto s3 = 0; s3 != obs.size(); ++s3) {
                     auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
                     auto atom3 = shell2atom[s3]; // Atom index of shell 3
                     auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                    for(auto s4=0; s4!=obs.size(); ++s4) {
+                    for(auto s4 = 0; s4 != obs.size(); ++s4) {
                         auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                         auto atom4 = shell2atom[s4]; // Atom index of shell 4
                         auto n4 = obs[s4].size();    // number of basis functions in shell 4
 
                         if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1,atom2,atom3,atom4};
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
                         eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
 
                         // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
                         double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
                         // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx=0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
                             // Look up multidimensional cartesian derivative index
                             auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
     
                             std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
     
                             // Find out which 
-                            for (int j=0; j < multi_cart_idx.size(); j++){
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
                                 int desired_atom_idx = multi_cart_idx[j] / 3;
                                 int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i=0; i<4; i++){
+                                for (int i = 0; i < 4; i++){
                                     int atom_idx = shell_atom_index_list[i];
                                     if (atom_idx == desired_atom_idx) {
                                         int tmp = 3 * i + desired_coord;
@@ -1065,13 +1058,13 @@ void eri_deriv_disk(int max_deriv_order) {
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i=0; i<buffer_indices.size(); ++i) {
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
                                 auto eri_shellset = eri_buffer[buffer_indices[i]];
                                 if (eri_shellset == nullptr) continue;
-                                for(auto f1=0, idx=0; f1!=n1; ++f1) {
-                                    for(auto f2=0; f2!=n2; ++f2) {
-                                        for(auto f3=0; f3!=n3; ++f3) {
-                                            for(auto f4=0; f4!=n4; ++f4, ++idx) {
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                                 eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
                                             }
                                         }
@@ -1116,37 +1109,37 @@ py::array eri_deriv_core(int deriv_order) {
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(natom * 3, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
-    libint2::Engine eri_engine(libint2::Operator::coulomb,obs.max_nprim(),obs.max_l(), deriv_order);
+    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
     const auto& eri_buffer = eri_engine.results(); // will point to computed shell sets
 
     size_t length = nbf * nbf * nbf * nbf * nderivs_triu;
     std::vector<double> result(length);
 
     // Begin shell quartet loops
-    for(auto s1=0; s1!=obs.size(); ++s1) {
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2!=obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
-            for(auto s3=0; s3!=obs.size(); ++s3) {
+            for(auto s3 = 0; s3 != obs.size(); ++s3) {
                 auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
                 auto atom3 = shell2atom[s3]; // Atom index of shell 3
                 auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                for(auto s4=0; s4!=obs.size(); ++s4) {
+                for(auto s4 = 0; s4 != obs.size(); ++s4) {
                     auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                     auto atom4 = shell2atom[s4]; // Atom index of shell 4
                     auto n4 = obs[s4].size();    // number of basis functions in shell 4
 
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1,atom2,atom3,atom4};
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
                     eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx=0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
                         size_t offset_nuc_idx = nuc_idx * nbf * nbf * nbf * nbf;
 
                         // Look up multidimensional cartesian derivative index
@@ -1154,10 +1147,10 @@ py::array eri_deriv_core(int deriv_order) {
     
                         // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
                         std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j=0; j < multi_cart_idx.size(); j++){
+                        for (int j = 0; j < multi_cart_idx.size(); j++){
                             int desired_atom_idx = multi_cart_idx[j] / 3;
                             int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i=0; i<4; i++){
+                            for (int i = 0; i<4; i++){
                                 int atom_idx = shell_atom_index_list[i];
                                 if (atom_idx == desired_atom_idx) {
                                     int tmp = 3 * i + desired_coord;
@@ -1191,19 +1184,18 @@ py::array eri_deriv_core(int deriv_order) {
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i=0; i<buffer_indices.size(); ++i) {
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
                             auto eri_shellset = eri_buffer[buffer_indices[i]];
                             if (eri_shellset == nullptr) continue;
-                            for(auto f1=0, idx=0; f1!=n1; ++f1) {
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                                 size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
-                                for(auto f2=0; f2!=n2; ++f2) {
+                                for(auto f2 = 0; f2 != n2; ++f2) {
                                     size_t offset_2 = (bf2 + f2) * nbf * nbf;
-                                    for(auto f3=0; f3!=n3; ++f3) {
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf;
-                                        for(auto f4=0; f4!=n4; ++f4, ++idx) {
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                             size_t offset_4 = bf4 + f4;
                                             result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += eri_shellset[idx];
-                                            //eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
                                         }
                                     }
                                 }
@@ -1217,7 +1209,6 @@ py::array eri_deriv_core(int deriv_order) {
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
 } // eri_deriv_disk function
 
-
 // Define module named 'libint_interface' which can be imported with python
 // The second arg, 'm' defines a variable py::module_ which can be used to create
 // bindings. the def() methods generates binding code that exposes new functions to Python.
diff --git a/quax/external_integrals/makefile b/quax/integrals/makefile
similarity index 60%
rename from quax/external_integrals/makefile
rename to quax/integrals/makefile
index 58047d7..93dad13 100644
--- a/quax/external_integrals/makefile
+++ b/quax/integrals/makefile
@@ -2,22 +2,22 @@
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/vulcan/ecm23353/Code/bin/libint-2.8.0/PREFIX
+LIBINT_PREFIX := /home/ecm23353/psi_env
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := /home/vulcan/ecm23353/.conda/envs/quax/include/eigen3
+I3 := /home/ecm23353/psi_env/include/eigen3
 # Python headers location 
-I4 := /home/vulcan/ecm23353/.conda/envs/quax/include/python3.10
+I4 := /home/ecm23353/psi_env/include/python3.10
 # Pybind11 headers location 
-I5 := /home/vulcan/ecm23353/.conda/envs/quax/lib/python3.10/site-packages/pybind11/include
+I5 := /home/ecm23353/psi_env/lib/python3.10/site-packages/pybind11/include
 # HDF5 headers, static and shared libraries 
-I6 := /home/vulcan/ecm23353/.conda/envs/quax/include
-L2 := /home/vulcan/ecm23353/.conda/envs/quax/lib
+I6 := /home/ecm23353/psi_env/include
+L2 := /home/ecm23353/psi_env/lib
 # Edit path in quotes to be same location as L2 definition above
-RPATH := -Wl,-rpath,"/home/vulcan/ecm23353/.conda/envs/quax/lib"
+RPATH := -Wl,-rpath,"/home/ecm23353/psi_env/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
@@ -33,7 +33,7 @@ clean:
 	rm -f $(OBJ)
 
 $(OBJ): %.o : %.cc $(DEPS)
-	$(CC) -c $< -o $@ -O3 -fPIC -std=c++11 -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+	$(CC) -c $< -o $@ -O3 -fPIC -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 $(TARGETS): $(OBJ)
-	$(CC) $^ -o $@ -O3 -fPIC -shared -std=c++11 -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+	$(CC) $^ -o $@ -O3 -fPIC -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index b5bde7d..976765e 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -1,225 +1,249 @@
-import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
+import jax
 import jax.numpy as jnp
 import numpy as np
-from jax.lax import fori_loop, while_loop
-from functools import partial
-
-from .integrals_utils import gaussian_product, boys, binomial_prefactor, factorials, double_factorials, neg_one_pow, cartesian_product, am_leading_indices, angular_momentum_combinations
-from .basis_utils import flatten_basis_data, get_nbf
-
-# Useful resources: Fundamentals of Molecular Integrals Evaluation, Fermann, Valeev https://arxiv.org/abs/2007.12057
-#                   Gaussian-Expansion methods of molecular integrals Taketa, Huzinaga, O-ohata 
-
-def overlap(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor):
-    """
-    Computes a single overlap integral. Taketa, Huzinaga, Oohata 2.12
-    P = gaussian product of aa,A; bb,B
-    PA_pow, PB_pow
-        All powers of Pi-Ai or Pi-Bi packed into an array
-        [[(Px-Ax)^0, (Px-Ax)^1, ... (Px-Ax)^max_am]
-         [(Py-Ay)^0, (Py-Ay)^1, ... (Py-Ay)^max_am]
-         [(Pz-Az)^0, (Pz-Az)^1, ... (Pz-Az)^max_am]]
-    prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
-    """
-    gamma = aa + bb
-    prefactor *= (jnp.pi / gamma)**1.5
-
-    wx = overlap_component(la,lb,PA_pow[0],PB_pow[0],gamma)
-    wy = overlap_component(ma,mb,PA_pow[1],PB_pow[1],gamma)
-    wz = overlap_component(na,nb,PA_pow[2],PB_pow[2],gamma)
-    return prefactor * wx * wy * wz
-
-def overlap_component(l1,l2,PAx,PBx,gamma):
-    """
-    The 1d overlap integral component. Taketa, Huzinaga, Oohata 2.12
-    """
-    K = 1 + (l1 + l2) // 2  
-
-    def loop_i(arr):
-       i, total = arr
-       return (i+1, total + binomial_prefactor(2*i,l1,l2,PAx,PBx) * double_factorials[2*i-1] / (2*gamma)**i)
-
-    i_accu, total_sum = while_loop(lambda arr: arr[0] < K, loop_i, (0, 0)) # (i, total)
-
-    return total_sum
-
-def kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor):
-    """
-    Computes a single kinetic energy integral.
-    """
-    gamma = aa + bb
-    prefactor *= (jnp.pi / gamma)**1.5
-    wx = overlap_component(la,lb,PA_pow[0],PB_pow[0],gamma)
-    wy = overlap_component(ma,mb,PA_pow[1],PB_pow[1],gamma)
-    wz = overlap_component(na,nb,PA_pow[2],PB_pow[2],gamma)
-    wx_plus2 = overlap_component(la,lb+2,PA_pow[0],PB_pow[0],gamma)
-    wy_plus2 = overlap_component(ma,mb+2,PA_pow[1],PB_pow[1],gamma)
-    wz_plus2 = overlap_component(na,nb+2,PA_pow[2],PB_pow[2],gamma)
-    wx_minus2 = overlap_component(la,lb-2,PA_pow[0],PB_pow[0],gamma)
-    wy_minus2 = overlap_component(ma,mb-2,PA_pow[1],PB_pow[1],gamma)
-    wz_minus2 = overlap_component(na,nb-2,PA_pow[2],PB_pow[2],gamma)
-
-    term1 = bb*(2*(lb+mb+nb)+3) * wx * wy * wz 
-
-    term2 = -2 * bb**2 * (wx_plus2*wy*wz + wx*wy_plus2*wz + wx*wy*wz_plus2)
-
-    term3 = -0.5 * (lb * (lb-1) * wx_minus2 * wy * wz \
-                  + mb * (mb-1) * wx * wy_minus2 * wz \
-                  + nb * (nb-1) * wx * wy * wz_minus2)
-    return prefactor * (term1 + term2 + term3)
-
-def A_array(l1,l2,PA,PB,CP,g,A_vals):
-
-    def loop_i(arr0):
-       i_0, r_0, u_0, A_0 = arr0
-       Aterm_0 = neg_one_pow[i_0] * binomial_prefactor(i_0,l1,l2,PA,PB) * factorials[i_0]
-       r_0 = i_0 // 2
-
-       def loop_r(arr1):
-          i_1, r_1, u_1, Aterm_1, A_1 = arr1
-          u_1 = (i_1 - 2 * r_1) // 2
-
-          def loop_u(arr2):
-             i_2, r_2, u_2, Aterm_2, A_2 = arr2
-             I = i_2 - 2 * r_2 - u_2
-             tmp = I - u_2
-             fact_ratio = 1 / (factorials[r_2] * factorials[u_2] * factorials[tmp])
-             Aterm_2 *= neg_one_pow[u_2]  * CP[tmp] * (0.25 / g)**(r_2+u_2) * fact_ratio
-             A_2 = A_2.at[I].set(Aterm_2)
-             u_2 -= 1
-             return (i_2, r_2, u_2, Aterm_2, A_2)
-
-          i_1_, r_1_, u_1_, Aterm_1_, A_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_u, (i_1, r_1, u_1, Aterm_1, A_1))
-          r_1_ -= 1
-          return (i_1_, r_1_, u_1_, Aterm_1_, A_1_)
-
-       i_0_, r_0_, u_0_, Aterm_0_, A_0_ = while_loop(lambda arr1: arr1[1] > -1, loop_r, (i_0, r_0, u_0, Aterm_0, A_0))
-       i_0_ -= 1
-       return (i_0_, r_0_, u_0_, A_0_)
-
-    i, r, u, A = while_loop(lambda arr0: arr0[0] > -1, loop_i, (l1 + l2, 0, 0, A_vals)) # (i, r, u, A)
-
-    return A
-
-def potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals):
-    """
-    Computes a single electron-nuclear attraction integral
-    """
-    gamma = aa + bb
-    prefactor *= -2 * jnp.pi / gamma
-
-    def loop_val(n, val):
-      Ax = A_array(la,lb,PA_pow[0],PB_pow[0],Pgeom_pow[n,0,:],gamma,A_vals)
-      Ay = A_array(ma,mb,PA_pow[1],PB_pow[1],Pgeom_pow[n,1,:],gamma,A_vals)
-      Az = A_array(na,nb,PA_pow[2],PB_pow[2],Pgeom_pow[n,2,:],gamma,A_vals)
-
-      I, J, K, total = 0, 0, 0, 0
-      def loop_I(arr0):
-         I_0, J_0, K_0, val_0, total_0 = arr0
-         J_0 = 0
-
-         def loop_J(arr1):
-            I_1, J_1, K_1, val_1, total_1 = arr1
-            K_1 = 0
-
-            def loop_K(arr2):
-               I_2, J_2, K_2, val_2, total_2 = arr2
-               total_2 += Ax[I_2] * Ay[J_2] * Az[K_2] * boys_eval[I_2 + J_2 + K_2, n]
-               K_2 += 1
-               return (I_2, J_2, K_2, val_2, total_2)
-
-            I_1_, J_1_, K_1_, val_1_, total_1_ = while_loop(lambda arr2: arr2[2] < na + nb + 1, loop_K, (I_1, J_1, K_1, val_1, total_1))
-            J_1_ += 1
-            return (I_1_, J_1_, K_1_, val_1_, total_1_)
-
-         I_0_, J_0_, K_0_, val_0_, total_0_ = while_loop(lambda arr1: arr1[1] < ma + mb + 1, loop_J, (I_0, J_0, K_0, val_0, total_0))
-         I_0_ += 1
-         return (I_0_, J_0_, K_0_, val_0_, total_0_)
-
-      I_, J_, K_, val_, total_ = while_loop(lambda arr0: arr0[0] < la + lb + 1, loop_I, (I, J, K, val, total))
-      val_ += charges[n] * prefactor * total_
-      return val_
-
-    val = fori_loop(0, Pgeom_pow.shape[0], loop_val, 0)
-    return val
-
-def oei_arrays(geom, basis, charges):
-    """
-    Build one electron integral arrays (overlap, kinetic, and potential integrals)
-    """
-    coeffs, exps, atoms, ams, indices, dims = flatten_basis_data(basis)
-    nbf = get_nbf(basis)
-    nprim = coeffs.shape[0]
-    max_am = jnp.max(ams)
-    A_vals = jnp.zeros(2*max_am+1)
-
-    # Save various AM distributions for indexing
-    # Obtain all possible primitive quartet index combinations 
-    primitive_duets = cartesian_product(jnp.arange(nprim), jnp.arange(nprim))
-    STV = jnp.zeros((3,nbf,nbf))
-
-    for n in range(primitive_duets.shape[0]):
-       p1,p2 = primitive_duets[n]
-       coef = coeffs[p1] * coeffs[p2]
-       aa, bb = exps[p1], exps[p2]
-       atom1, atom2 = atoms[p1], atoms[p2]
-       am1, am2 = ams[p1], ams[p2]
-       A, B = geom[atom1], geom[atom2]
-       ld1, ld2 = am_leading_indices[am1], am_leading_indices[am2]
-
-       gamma = aa + bb
-       prefactor = jnp.exp(-aa * bb * jnp.dot(A-B,A-B) / gamma)
-       P = (aa * A + bb * B) / gamma
-       # Maximum angular momentum: hard coded
-       #max_am = 3 # f function support
-       # Precompute all powers up to 2+max_am of Pi-Ai, Pi-Bi.
-       # We need 2+max_am since kinetic requires incrementing angluar momentum by +2
-       PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+3, 3)).T, jnp.arange(max_am+3))
-       PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+3, 3)).T, jnp.arange(max_am+3))
-
-       # For potential integrals, we need the difference between
-       # the gaussian product center P and ALL atoms in the molecule,
-       # and then take all possible powers up to 2*max_am.
-       # We pre-collect this into a 3d array, and then just pull out what we need via indexing in the loops, so they need not be recomputed.
-       # The resulting array has dimensions (atom, cartesian component, power) so index (0, 1, 3) would return (Py - atom0_y)^3
-       P_minus_geom = jnp.broadcast_to(P, geom.shape) - geom
-       Pgeom_pow = jnp.power(jnp.transpose(jnp.broadcast_to(P_minus_geom, (2*max_am + 1,geom.shape[0],geom.shape[1])), (1,2,0)), jnp.arange(2*max_am + 1))
-       # All possible jnp.dot(P-atom,P-atom)
-       rcp2 = jnp.einsum('ij,ij->i', P_minus_geom, P_minus_geom)
-       # All needed (and unneeded, for am < max_am) boys function evaluations
-       boys_arg = jnp.broadcast_to(rcp2 * gamma, (2*max_am+1, geom.shape[0]))
-       boys_nu = jnp.tile(jnp.arange(2*max_am+1), (geom.shape[0],1)).T
-       boys_eval = boys(boys_nu,boys_arg)
-
-       a, b = 0, 0
-       def loop_a(arr0):
-          a_0, b_0, oei_0 = arr0
-          b_0 = 0
-
-          def loop_b(arr1):
-             a_1, b_1, oei_1 = arr1
-             # Gather angular momentum and index
-             la,ma,na = angular_momentum_combinations[a_1 + ld1]
-             lb,mb,nb = angular_momentum_combinations[b_1 + ld2]
-             # To only create unique indices, need to have separate indices arrays for i and j.
-             i = indices[p1] + a_1
-             j = indices[p2] + b_1
-             # Compute one electron integrals and add to appropriate index
-             overlap_int = overlap(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
-             kinetic_int = kinetic(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,prefactor) * coef
-             potential_int = potential(la,ma,na,lb,mb,nb,aa,bb,PA_pow,PB_pow,Pgeom_pow,boys_eval,prefactor,charges,A_vals) * coef
-             oei_1 = oei_1.at[([0,1,2],[i,i,i],[j,j,j])].set((overlap_int, kinetic_int, potential_int))
-             b_1 += 1
-             return (a_1, b_1, oei_1)
-
-          a_0_, b_0_, oei_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, oei_0))
-          a_0_ += 1
-          return (a_0_, b_0_, oei_0_)
-
-       a_, b_, oei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, STV))
-
-       return oei_
-
-    return STV[0], STV[1], STV[2]
+import h5py
+import os
+import psi4
+from . import libint_interface
+from ..utils import get_deriv_vec_idx, how_many_derivs
+
+jax.config.update("jax_enable_x64", True)
+
+class OEI(object):
+
+    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
+        with open(xyz_path, 'r') as f:
+            tmp = f.read()
+        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+        natoms = molecule.natom()
+        nbf = basis_set.nbf()
+
+        # TODO: implement the core algorithm for OEIs in libint_interface.cc
+        #if mode == 'core' and max_deriv_order > 0:
+            #self.oei_derivatives = {}
+
+        self.mode = mode
+        self.nbf = nbf
+
+        # Create new JAX primitives for overlap, kinetic, potential evaluation and their derivatives
+        self.overlap_p = jax.core.Primitive("overlap")
+        self.overlap_deriv_p = jax.core.Primitive("overlap_deriv")
+        self.kinetic_p = jax.core.Primitive("kinetic")
+        self.kinetic_deriv_p = jax.core.Primitive("kinetic_deriv")
+        self.potential_p = jax.core.Primitive("potential")
+        self.potential_deriv_p = jax.core.Primitive("potential_deriv")
+
+        # Register primitive evaluation rules
+        self.overlap_p.def_impl(self.overlap_impl)
+        self.overlap_deriv_p.def_impl(self.overlap_deriv_impl)
+        self.kinetic_p.def_impl(self.kinetic_impl)
+        self.kinetic_deriv_p.def_impl(self.kinetic_deriv_impl)
+        self.potential_p.def_impl(self.potential_impl)
+        self.potential_deriv_p.def_impl(self.potential_deriv_impl)
+
+        # Register the JVP rules with JAX
+        jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
+        jax.interpreters.ad.primitive_jvps[self.overlap_deriv_p] = self.overlap_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.kinetic_p] = self.kinetic_jvp
+        jax.interpreters.ad.primitive_jvps[self.kinetic_deriv_p] = self.kinetic_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.potential_p] = self.potential_jvp
+        jax.interpreters.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
+
+        # Register the batching rules with JAX
+        jax.interpreters.batching.primitive_batchers[self.overlap_deriv_p] = self.overlap_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.kinetic_deriv_p] = self.kinetic_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.potential_deriv_p] = self.potential_deriv_batch
+
+    # Create functions to call primitives
+    def overlap(self, geom):
+        return self.overlap_p.bind(geom)
+
+    def overlap_deriv(self, geom, deriv_vec):
+        return self.overlap_deriv_p.bind(geom, deriv_vec)
+
+    def kinetic(self, geom):
+        return self.kinetic_p.bind(geom)
+
+    def kinetic_deriv(self, geom, deriv_vec):
+        return self.kinetic_deriv_p.bind(geom, deriv_vec)
+
+    def potential(self, geom):
+        return self.potential_p.bind(geom)
+
+    def potential_deriv(self, geom, deriv_vec):
+        return self.potential_deriv_p.bind(geom, deriv_vec)
+
+    # Create primitive evaluation rules
+    def overlap_impl(self, geom):
+        S = libint_interface.overlap()
+        S = S.reshape(self.nbf,self.nbf)
+        return jnp.asarray(S)
+
+    def kinetic_impl(self, geom):
+        T = libint_interface.kinetic()
+        T = T.reshape(self.nbf,self.nbf)
+        return jnp.asarray(T)
+
+    def potential_impl(self, geom):
+        V = libint_interface.potential()
+        V = V.reshape(self.nbf,self.nbf)
+        return jnp.asarray(V)
+
+    def overlap_deriv_impl(self, geom, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+
+        # TODO: update once the core algorithm is implemented in libint; for now this computes one slice at a time
+        if self.mode == 'core':
+            S = libint_interface.overlap_deriv(np.asarray(deriv_vec, int))
+            return jnp.asarray(S).reshape(self.nbf,self.nbf)
+        else:
+            idx = get_deriv_vec_idx(deriv_vec)
+            if os.path.exists("oei_derivs.h5"):
+                file_name = "oei_derivs.h5"
+                dataset_name = "overlap_deriv" + str(deriv_order)
+            elif os.path.exists("oei_partials.h5"):
+                file_name = "oei_partials.h5"
+                dataset_name = "overlap_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("Something went wrong reading integral derivative file")
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 3:
+                    S = data_set[:,:,idx]
+                elif len(data_set.shape) == 2:
+                    S = data_set[:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(S)
+
+    def kinetic_deriv_impl(self, geom, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+
+        # TODO: update once the core algorithm is implemented in libint; for now this computes one slice at a time
+        if self.mode == 'core':
+            T = libint_interface.kinetic_deriv(np.asarray(deriv_vec, int))
+            return jnp.asarray(T).reshape(self.nbf,self.nbf)
+        else:
+            idx = get_deriv_vec_idx(deriv_vec)
+            if os.path.exists("oei_derivs.h5"):
+                file_name = "oei_derivs.h5"
+                dataset_name = "kinetic_deriv" + str(deriv_order)
+            elif os.path.exists("oei_partials.h5"):
+                file_name = "oei_partials.h5"
+                dataset_name = "kinetic_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("Something went wrong reading integral derivative file")
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 3:
+                    T = data_set[:,:,idx]
+                elif len(data_set.shape) == 2:
+                    T = data_set[:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(T)
+
+    def potential_deriv_impl(self, geom, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+
+        # TODO: update once the core algorithm is implemented in libint; for now this computes one slice at a time
+        if self.mode == 'core':
+            V = libint_interface.potential_deriv(np.asarray(deriv_vec, int))
+            return jnp.asarray(V).reshape(self.nbf,self.nbf)
+        else:
+            idx = get_deriv_vec_idx(deriv_vec)
+            if os.path.exists("oei_derivs.h5"):
+                file_name = "oei_derivs.h5"
+                dataset_name = "potential_deriv" + str(deriv_order)
+            elif os.path.exists("oei_partials.h5"):
+                file_name = "oei_partials.h5"
+                dataset_name = "potential_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("Something went wrong reading integral derivative file")
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 3:
+                    V = data_set[:,:,idx]
+                elif len(data_set.shape) == 2:
+                    V = data_set[:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(V)
+
+    def overlap_jvp(self, primals, tangents):
+        geom, = primals
+        primals_out = self.overlap(geom)
+        tangents_out = self.overlap_deriv(geom, tangents[0])
+        return primals_out, tangents_out
+
+    def overlap_deriv_jvp(self, primals, tangents):
+        geom, deriv_vec = primals
+        primals_out = self.overlap_deriv(geom, deriv_vec)
+        tangents_out = self.overlap_deriv(geom, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    def kinetic_jvp(self, primals, tangents):
+        geom, = primals
+        primals_out = self.kinetic(geom)
+        tangents_out = self.kinetic_deriv(geom, tangents[0])
+        return primals_out, tangents_out
+
+    def kinetic_deriv_jvp(self, primals, tangents):
+        geom, deriv_vec = primals
+        primals_out = self.kinetic_deriv(geom, deriv_vec)
+        tangents_out = self.kinetic_deriv(geom, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    def potential_jvp(self, primals, tangents):
+        geom, = primals
+        primals_out = self.potential(geom)
+        tangents_out = self.potential_deriv(geom, tangents[0])
+        return primals_out, tangents_out
+
+    def potential_deriv_jvp(self, primals, tangents):
+        geom, deriv_vec = primals
+        primals_out = self.potential_deriv(geom, deriv_vec)
+        tangents_out = self.potential_deriv(geom, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    # Define batching rules. These are only needed because jax.jacfwd calls vmap on the JVPs
+    # of each OEI function.
+    def overlap_deriv_batch(self, batched_args, batch_dims):
+        # When the deriv_batch input argument is batched along the 0th axis,
+        # we want to evaluate every 2D slice and gather up an (ncart, n, n) array
+        # (expand dims at axis 0 and concatenate along axis 0),
+        # and then return the results, indicating that the output batch axis
+        # is in the 0th position (return results, 0).
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.overlap_deriv(geom_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
+    def kinetic_deriv_batch(self, batched_args, batch_dims):
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.kinetic_deriv(geom_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
+    def potential_deriv_batch(self, batched_args, batch_dims):
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.potential_deriv(geom_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
 
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 258d4eb..e738d3b 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -1,310 +1,131 @@
 import jax 
-from jax.config import config
-config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.lax import fori_loop, while_loop
-
-from .basis_utils import flatten_basis_data, get_nbf
-from .integrals_utils import gaussian_product, boys, binomial_prefactor, cartesian_product, am_leading_indices, angular_momentum_combinations, fact_ratio2, neg_one_pow
-
-def B_array(l1,l2,l3,l4,pa_pow,pb_pow,qc_pow,qd_pow,qp_pow,g1_pow,g2_pow,oodelta_pow,B_vals):
-    #TODO can you do some reduction magic to reduce the number of loops?
-    # Can you split it into two Scopes?
-    # Can you convert  all or part of this to a tensor contraction?  
-    # It does not appear to help to pull out binomial prefactors and compute outside loop.
-
-    def loop_i1(arr0):
-       i1_0, i2_0, r1_0, r2_0, u_0, B_0 = arr0
-       Bterm = binomial_prefactor(i1_0,l1,l2,pa_pow,pb_pow)
-       tmp = i1_0
-       r1_0 = i1_0 // 2
-
-       def loop_r1(arr1):
-          i1_1, i2_1, r1_1, r2_1, u_1, B_1 = arr1
-          Bterm *= fact_ratio2[i1_1,r1_1]
-          Bterm *= g1_pow[r1_1-i1_1]
-          tmp -= 2 * r1_1
-          i2_1 = l3 + l4
-
-          def loop_i2(arr2):
-             i1_2, i2_2, r1_2, r2_2, u_2, B_2 = arr2
-             Bterm *= neg_one_pow[i2_2]
-             Bterm *= binomial_prefactor(i2_2,l3,l4,qc_pow,qd_pow)
-             tmp += i2_2
-             r2_2 = i2_2 // 2
-
-             def loop_r2(arr3):
-                i1_3, i2_3, r1_3, r2_3, u_3, B_3 = arr3
-                Bterm *= fact_ratio2[i2_3,r2_3]
-                Bterm *= g2_pow[r2_3-i2_3]
-                tmp -= 2 * r2_3
-                u_3 = tmp // 2
-
-                def loop_u(arr4):
-                   i1_4, i2_4, r1_4, r2_4, u_4, B_4 = arr4
-                   I = tmp - u_4
-                   Bterm *= neg_one_pow[u_4]
-                   Bterm *= fact_ratio2[tmp,u_4]
-                   Bterm *= qp_pow[tmp - 2 * u_4]
-                   Bterm *= oodelta_pow[I]
-                   B = B.at[I].set(Bterm)
-                   u_4 -= 1
-                   return (i1_4, i2_4, r1_4, r2_4, u_4, B_4)
-
-                i1_3_, i2_3_, r1_3_, r2_3_, u_3_, B_3_ = while_loop(lambda arr4: arr4[4] > -1, loop_u, (i1_3, i2_3, r1_3, r2_3, u_3, B_3))
-                r2_3_ -= 1
-                return (i1_3_, i2_3_, r1_3_, r2_3_, u_3_, B_3_)
-
-             i1_2_, i2_2_, r1_2_, r2_2_, u_2_, B_2_ = while_loop(lambda arr3: arr3[3] > -1, loop_r2, (i1_2, i2_2, r1_2, r2_2, u_2, B_2))
-             i2_2_ -= 1
-             return (i1_2_, i2_2_, r1_2_, r2_2_, u_2_, B_2_)
-
-          i1_1_, i2_1_, r1_1_, r2_1_, u_1_, B_1_ = while_loop(lambda arr2: arr2[1] > -1, loop_i2, (i1_1, i2_1, r1_1, r2_1, u_1, B_1))
-          r1_1_ -= 1
-          return (i1_1_, i2_1_, r1_1_, r2_1_, u_1_, B_1_)
-
-       i1_0_, i2_0_, r1_0_, r2_0_, u_0_, B_0_ = while_loop(lambda arr1: arr1[2] > -1, loop_r1, (i1_0, i2_0, r1_0, r2_0, u_0, B_0))
-       i1_0_ -= 1
-       return (i1_0_, i2_0_, r1_0_, r2_0_, u_0_, B_0_)
-
-    i1, i2, r1, r2, u, B = while_loop(lambda arr0: arr0[0] > -1, loop_i1, (l1 + l2, 0, 0, 0, 0, B_vals)) # (i1, i2, r1, r2, u, B)
-    return B
-
-# def primitive_tei(La,Lb,Lc,Ld, A, B, C, D, aa, bb, cc, dd, c1, c2, c3, c4):
-#     """
-#     TODO can define a jvp rule for this, have it increment arguments appropriately
-#     Computes a single contracted two electron integral.
-#     given angular momentum vectors, centers, and single value exponents and contraction coefficients
-#     """
-#     # NOTE THIS FUNCTION IS NOT USED.
-#     # For debugging. This is implementation is directly coded into tei_array
-#     # in order to save some intermediates.
-#     la, ma, na = La
-#     lb, mb, nb = Lb
-#     lc, mc, nc = Lc
-#     ld, md, nd = Ld
-#     xa,ya,za = A
-#     xb,yb,zb = B
-#     xc,yc,zc = C
-#     xd,yd,zd = D
-
-#     rab2 = jnp.dot(A-B,A-B)
-#     rcd2 = jnp.dot(C-D,C-D)
-#     coef = c1 * c2 * c3 * c4
-#     xyzp = gaussian_product(aa,A,bb,B)
-#     xyzq = gaussian_product(cc,C,dd,D)
-#     xp,yp,zp = xyzp
-#     xq,yq,zq = xyzq
-#     rpq2 = jnp.dot(xyzp-xyzq,xyzp-xyzq)
-#     gamma1 = aa + bb
-#     gamma2 = cc + dd
-#     delta = 0.25*(1/gamma1+1/gamma2)
-#     Bx = B_array(la,lb,lc,ld,xp,xa,xb,xq,xc,xd,gamma1,gamma2,delta)
-#     By = B_array(ma,mb,mc,md,yp,ya,yb,yq,yc,yd,gamma1,gamma2,delta)
-#     Bz = B_array(na,nb,nc,nd,zp,za,zb,zq,zc,zd,gamma1,gamma2,delta)
-#     boys_arg = 0.25*rpq2/delta
-#     boys_eval = boys(jnp.arange(13), boys_arg) # supports up to f functions
-
-#     with loops.Scope() as s:
-#       s.I = 0
-#       s.J = 0
-#       s.K = 0
-#       s.primitive = 0.
-#       s.I = 0
-#       for _ in s.while_range(lambda: s.I < la + lb + lc + ld + 1):
-#         s.J = 0
-#         for _ in s.while_range(lambda: s.J < ma + mb + mc + md + 1):
-#           s.K = 0
-#           for _ in s.while_range(lambda: s.K < na + nb + nc + nd + 1):
-#             s.primitive += Bx[s.I] * By[s.J] * Bz[s.K] * boys_eval[s.I + s.J + s.K]
-#             s.K += 1
-#           s.J += 1
-#         s.I += 1
-#       value = 2*jax.lax.pow(jnp.pi,2.5)/(gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
-#               *jnp.exp(-aa*bb*rab2/gamma1) \
-#               *jnp.exp(-cc*dd*rcd2/gamma2)*s.primitive*coef
-#       return value
-
-def tei_array(geom, basis):
-    """
-    Build two electron integral array from a jax.numpy array of the cartesian geometry in Bohr, 
-    and a basis dictionary as defined by basis_utils.build_basis_set
-    We have to loop over primitives rather than shells because JAX needs intermediates to be consistent 
-    sizes in order to compile.
-    """
-    # Smush primitive data together into vectors
-    coeffs, exps, atoms, ams, indices, dims = flatten_basis_data(basis)
-    nbf = get_nbf(basis)
-    max_am = jnp.max(ams)
-    max_am_idx = max_am * 4 + 1 
-    #TODO add excpetion raise if angular momentum is too high
-    B_vals = jnp.zeros(4*max_am+1)  
-    nprim = coeffs.shape[0]
-    # Obtain all possible primitive quartet index combinations 
-    primitive_quartets = cartesian_product(jnp.arange(nprim), jnp.arange(nprim), jnp.arange(nprim), jnp.arange(nprim))
-
-    #print("Number of basis functions: ", nbf)
-    #print("Number of primitve quartets: ", primitive_quartets.shape[0])
-
-    #TODO Experimental: precompute quantities and lookup inside loop
-    # Compute all possible Gaussian products for this basis set
-    aa_plus_bb = jnp.broadcast_to(exps, (nprim,nprim)) + jnp.transpose(jnp.broadcast_to(exps, (nprim,nprim)), (1,0))
-    aa_times_A = jnp.einsum('i,ij->ij', exps, geom[atoms])
-    aaxA_plus_bbxB = aa_times_A[:,None,:] + aa_times_A[None,:,:]
-    gaussian_products = jnp.einsum('ijk,ij->ijk', aaxA_plus_bbxB, 1/aa_plus_bb)  
-
-    # Compute all rab2 (rcd2), every possible jnp.dot(A-B,A-B)
-    natom = geom.shape[0]
-    tmpA = jnp.broadcast_to(geom, (natom,natom,3))
-    AminusB = (tmpA - jnp.transpose(tmpA, (1,0,2)))
-    AmBdot = jnp.einsum('ijk,ijk->ij', AminusB, AminusB) # shape: (natom,natom)
-
-    # Compute all differences between gaussian product centers with all atom centers
-    tmpP = jnp.tile(gaussian_products, natom).reshape(nprim,nprim,natom,3)
-    PminusA = tmpP - jnp.broadcast_to(geom, tmpP.shape)
-
-    # Commpute all powers (up to max_am) of differences between gaussian product centers and atom centers
-    # Shape: (nprim, nprim, natom, 3, max_am+1). In loop index PA_pow as [p1,p2,atoms[p1],:,:]
-    PminusA_pow = jnp.power(jnp.transpose(jnp.broadcast_to(PminusA, (max_am+1,nprim,nprim,natom,3)), (1,2,3,4,0)), jnp.arange(max_am+1))
-
-    def loop_prim_quartets(n, G_tei):
-      # Load in primitive indices, coeffs, exponents, centers, angular momentum index, and leading placement index in TEI array
-      p1,p2,p3,p4 = primitive_quartets[n]
-      coef = coeffs[p1] * coeffs[p2] * coeffs[p3] * coeffs[p4]
-      aa, bb, cc, dd = exps[p1], exps[p2], exps[p3], exps[p4]
-      ld1, ld2, ld3, ld4 = am_leading_indices[ams[p1]],am_leading_indices[ams[p2]],am_leading_indices[ams[p3]],am_leading_indices[ams[p4]]
-      idx1, idx2, idx3, idx4 = indices[p1],indices[p2],indices[p3],indices[p4],
-      #A, B, C, D = geom[atoms[p1]], geom[atoms[p2]], geom[atoms[p3]], geom[atoms[p4]]
-
-      # Compute common intermediates before looping over AM distributions.
-      # Avoids redundant recomputations/reassignment for all classes other than (ss|ss).
-      #AB = A - B
-      #CD = C - D
-      #rab2 = jnp.dot(AB,AB)
-      #rcd2 = jnp.dot(CD,CD)
-      #P = (aa * A + bb * B) / gamma1
-      #Q = (cc * C + dd * D) / gamma2
-      gamma1 = aa + bb
-      gamma2 = cc + dd
-
-      #TODO
-      P = gaussian_products[p1,p2]
-      Q = gaussian_products[p3,p4]
-      rab2 = AmBdot[atoms[p1],atoms[p2]]
-      rcd2 = AmBdot[atoms[p3],atoms[p4]]
-      #PA = PminusA[p1,p2,atoms[p1]]
-      #PB = PminusA[p1,p2,atoms[p2]]
-      #QC = PminusA[p3,p4,atoms[p3]]
-      #QD = PminusA[p3,p4,atoms[p4]]
-      #TODO
-
-      PQ = P - Q
-      rpq2 = jnp.dot(PQ,PQ)
-      delta = 0.25*(1/gamma1+1/gamma2)
-      boys_arg = 0.25 * rpq2 / delta
-      boys_eval = boys(jnp.arange(max_am_idx), boys_arg)
-
-      # Need all powers of Pi-Ai,Pi-Bi,Qi-Ci,Qi-Di (i=x,y,z) up to max_am and Qi-Pi up to max_am_idx
-      # note: this computes unncessary quantities for lower angular momentum,
-      # but avoids repeated computation of the same quantities in loops for higher angular momentum
-
-      #PA_pow = jnp.power(jnp.broadcast_to(P-A, (max_am+1,3)).T, jnp.arange(max_am+1))
-      #PB_pow = jnp.power(jnp.broadcast_to(P-B, (max_am+1,3)).T, jnp.arange(max_am+1))
-      #QC_pow = jnp.power(jnp.broadcast_to(Q-C, (max_am+1,3)).T, jnp.arange(max_am+1))
-      #QD_pow = jnp.power(jnp.broadcast_to(Q-D, (max_am+1,3)).T, jnp.arange(max_am+1))
-
-      PA_pow = PminusA_pow[p1,p2,atoms[p1],:,:]
-      PB_pow = PminusA_pow[p1,p2,atoms[p2],:,:]
-      QC_pow = PminusA_pow[p3,p4,atoms[p3],:,:]
-      QD_pow = PminusA_pow[p3,p4,atoms[p4],:,:]
-      QP_pow = jnp.power(jnp.broadcast_to(Q-P, (max_am_idx,3)).T, jnp.arange(max_am_idx))
-
-      # Gamma powers are negative, up to -(l1+l2).
-      # Make array such that the given negative index returns the same negative power.
-      g1_pow = jnp.power(4*gamma1, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1))
-      g2_pow = jnp.power(4*gamma2, -jnp.roll(jnp.flip(jnp.arange(2*max_am+1)),1))
-      oodelta_pow = jnp.power(1 / delta, jnp.arange(max_am_idx))  # l1 + l2 + l3 + l4 + 1
-
-      prefactor = 34.986836655249726 / (gamma1*gamma2*jnp.sqrt(gamma1+gamma2)) \
-                  * jnp.exp(-aa*bb*rab2/gamma1 + -cc*dd*rcd2/gamma2) * coef
-
-      a, b, c, d = 0, 0, 0, 0
-      def loop_a(arr0):
-         a_0, b_0, c_0, d_0, G_0 = arr0
-         b_0 = 0
-
-         def loop_b(arr1):
-            a_1, b_1, c_1, d_1, G_1 = arr1
-            c_1 = 0
-
-            def loop_c(arr2):
-               a_2, b_2, c_2, d_2, G_2 = arr2
-               d_2 = 0
-
-               def loop_d(arr3):
-                  a_3, b_3, c_3, d_3, G_3 = arr3
-                  # Collect angular momentum and index in G
-                  la, ma, na = angular_momentum_combinations[a_3 + ld1]
-                  lb, mb, nb = angular_momentum_combinations[b_3 + ld2]
-                  lc, mc, nc = angular_momentum_combinations[c_3 + ld3]
-                  ld, md, nd = angular_momentum_combinations[d_3 + ld4]
-                  i = idx1 + a_3
-                  j = idx2 + b_3
-                  k = idx3 + c_3
-                  l = idx4 + d_3
-                  # Compute the primitive quartet tei and add to appropriate index in G
-                  Bx = B_array(la,lb,lc,ld,PA_pow[0],PB_pow[0],QC_pow[0],QD_pow[0],QP_pow[0],g1_pow,g2_pow,oodelta_pow,B_vals)
-                  By = B_array(ma,mb,mc,md,PA_pow[1],PB_pow[1],QC_pow[1],QD_pow[1],QP_pow[1],g1_pow,g2_pow,oodelta_pow,B_vals)
-                  Bz = B_array(na,nb,nc,nd,PA_pow[2],PB_pow[2],QC_pow[2],QD_pow[2],QP_pow[2],g1_pow,g2_pow,oodelta_pow,B_vals)
-
-                  I, J, K, primitive = 0, 0, 0, 0.0
-                  def loop_I(arrI):
-                     I_I, J_I, K_I, primitive_I = arrI
-                     J_I = 0
-                     tmp = Bx[I_I]
-
-                     def loop_J(arrJ):
-                        I_J, J_J, K_J, primitive_J = arrJ
-                        K_J = 0
-                        tmp *= By[J_J]
-
-                        def loop_K(arrK):
-                           I_K, J_K, K_K, primitive_K = arrK
-                           tmp *= Bz[K_K] * boys_eval[I_K + J_K + K_K]
-                           primitive_K += tmp
-                           K_K += 1
-                           return (I_K, J_K, K_K, primitive_K)
-
-                        I_J_, J_J_, K_J_, primitive_J_ = while_loop(lambda arrK: arrK[2] < na + nb + nc + nd + 1, loop_K, (I_J, J_J, K_J, primitive_J))
-                        J_J_ += 1
-                        return (I_J_, J_J_, K_J_, primitive_J_)
-
-                     I_I_, J_I_, K_I_, primitive_I_ = while_loop(lambda arrJ: arrJ[1] < ma + mb + mc + md + 1, loop_J, (I_I, J_I, K_I, primitive_I))
-                     I_I_ += 1 # I
-                     return (I_I_, J_I_, K_I_, primitive_I_)
-
-                  I_, J_, K_, primitive_ = while_loop(lambda arrI: arrI[0] < la + lb + lc + ld + 1, loop_I, (I, J, K, primitive))
-
-                  tei = prefactor * primitive_
-                  G_3 = G_3.at[i, j, k, l].set(tei)
-                  d_3 += 1
-                  return (a_3, b_3, c_3, d_3, G_3)
-
-               a_2_, b_2_, c_2_, d_2_, G_2_ = while_loop(lambda arr3: arr3[3] < dims[p4], loop_d, (a_2, b_2, c_2, d_2, G_2))
-               c_2_ += 1
-               return (a_2_, b_2_, c_2_, d_2_, G_2_)
-
-            a_1_, b_1_, c_1_, d_1_, G_1_ = while_loop(lambda arr2: arr2[2] < dims[p3], loop_c, (a_1, b_1, c_1, d_1, G_1))
-            b_1_ += 1
-            return (a_1_, b_1_, c_1_, d_1_, G_1_)
-
-         a_0_, b_0_, c_0_, d_0_, G_0_ = while_loop(lambda arr1: arr1[1] < dims[p2], loop_b, (a_0, b_0, c_0, d_0, G_0))
-         a_0_ += 1
-         return (a_0_, b_0_, c_0_, d_0_, G_0_)
-
-      a_, b_, c_, d_, G_tei_ = while_loop(lambda arr0: arr0[0] < dims[p1], loop_a, (a, b, c, d, G_tei))
-      return G_tei_
-
-    G = fori_loop(0, primitive_quartets.shape[0], loop_prim_quartets, jnp.zeros((nbf,nbf,nbf,nbf)))
-    return G
+import numpy as np
+import h5py
+import os
+import psi4
+from . import libint_interface
+from ..utils import get_deriv_vec_idx, how_many_derivs
+
+jax.config.update("jax_enable_x64", True)
+
+class TEI(object):
+
+    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
+        with open(xyz_path, 'r') as f:
+            tmp = f.read()
+        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+        natoms = molecule.natom()
+        nbf = basis_set.nbf()
+
+        if mode == 'core' and max_deriv_order > 0:
+            # A list of ERI derivative tensors, containing only unique elements
+            # corresponding to upper hypertriangle (since derivative tensors are symmetric)
+            # Length of list is maximum deriv order, each array is (n_unique_derivs,nbf,nbf,nbf,nbf)
+            # Then when JAX calls JVP, read appropriate slice
+            self.eri_derivatives = []
+            for i in range(max_deriv_order):
+                n_unique_derivs = how_many_derivs(natoms, i + 1)
+                eri_deriv = libint_interface.eri_deriv_core(i+1).reshape(n_unique_derivs,nbf,nbf,nbf,nbf)
+                self.eri_derivatives.append(eri_deriv)
+
+        self.mode = mode
+        self.nbf = nbf
+
+        # Create new JAX primitive for TEI evaluation
+        self.tei_p = jax.core.Primitive("tei")
+        self.tei_deriv_p = jax.core.Primitive("tei_deriv")
+
+        # Register primitive evaluation rules
+        self.tei_p.def_impl(self.tei_impl)
+        self.tei_deriv_p.def_impl(self.tei_deriv_impl)
+
+        # Register the JVP rules with JAX
+        jax.interpreters.ad.primitive_jvps[self.tei_p] = self.tei_jvp
+        jax.interpreters.ad.primitive_jvps[self.tei_deriv_p] = self.tei_deriv_jvp
+
+        # Register tei_deriv batching rule with JAX
+        jax.interpreters.batching.primitive_batchers[self.tei_deriv_p] = self.tei_deriv_batch
+
+    # Create functions to call primitives
+    def tei(self, geom):
+        return self.tei_p.bind(geom)
+
+    def tei_deriv(self, geom, deriv_vec):
+        return self.tei_deriv_p.bind(geom, deriv_vec)
+
+    # Create primitive evaluation rules
+    def tei_impl(self, geom):
+        G = libint_interface.eri()
+        #d = int(np.sqrt(np.sqrt(G.shape[0])))
+        G = G.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        return jnp.asarray(G)
+
+    def tei_deriv_impl(self, geom, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
+
+        # Use eri derivatives in memory
+        if self.mode == 'core':
+            G = self.eri_derivatives[deriv_order-1][idx,:,:,:,:]
+            return jnp.asarray(G)
+
+        # Read from disk
+        elif self.mode == 'disk':
+            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
+            if os.path.exists("eri_derivs.h5"):
+                file_name = "eri_derivs.h5"
+                dataset_name = "eri_deriv" + str(deriv_order)
+            # If not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
+            elif os.path.exists("eri_partials.h5"):
+                file_name = "eri_partials.h5"
+                dataset_name = "eri_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("ERI derivatives not found on disk")
+
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 5:
+                    G = data_set[:,:,:,:,idx]
+                elif len(data_set.shape) == 4:
+                    G = data_set[:,:,:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(G)
+
+
+    # Create Jacobian-vector product rule, which given some input args (primals)
+    # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
+    # and the slice of the Jacobian (tangents_out)
+    def tei_jvp(self, primals, tangents):
+        geom, = primals
+        primals_out = self.tei(geom)
+        tangents_out = self.tei_deriv(geom, tangents[0])
+        return primals_out, tangents_out
+
+    def tei_deriv_jvp(self, primals, tangents):
+        geom, deriv_vec = primals
+        primals_out = self.tei_deriv(geom, deriv_vec)
+        # Here we add the current value of deriv_vec to the incoming tangent vector,
+        # so that nested higher order differentiation works
+        tangents_out = self.tei_deriv(geom, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    # Define batching rule; this is only needed because jax.jacfwd will call vmap on the JVP of tei
+    def tei_deriv_batch(self, batched_args, batch_dims):
+        # When the input argument of deriv_batch is batched along the 0'th axis
+        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+        # (expand dims at 0 and concatenate at 0)
+        # and then return the results, indicating the out batch axis
+        # is in the 0th position (return results, 0)
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.tei_deriv(geom_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
 
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 3acc6b8..9242b13 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -27,8 +27,8 @@ def rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_or
     fock_Vd = eps[v]
 
     # Oribital energy denominators 
-    D = 1.0 / (fock_Od.reshape(-1,1,1,1) + fock_Od.reshape(-1,1,1) - fock_Vd.reshape(-1,1) - fock_Vd)
-    d = 1.0 / (fock_Od.reshape(-1,1) - fock_Vd)
+    D = 1.0 / (fock_Od.reshape(-1, 1, 1, 1) + fock_Od.reshape(-1, 1, 1) - fock_Vd.reshape(-1, 1) - fock_Vd)
+    d = 1.0 / (fock_Od.reshape(-1, 1) - fock_Vd)
 
     # Initial Amplitudes
     T1 = jnp.zeros((ndocc,nvir))
@@ -42,7 +42,7 @@ def rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_or
         E_old = E_ccsd * 1
 
         T1, T2 = rccsd_iter(T1, T2, V, d, D, ndocc, nvir)
-        E_ccsd = rccsd_energy(T1,T2,V[2])
+        E_ccsd = rccsd_energy(T1, T2, V[2])
 
         iteration += 1
         if iteration == maxit:
@@ -60,10 +60,10 @@ def rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_or
 @jax.jit
 def rccsd_energy(T1, T2, Voovv):
     E_ccsd = 0.0
-    E_ccsd -= jnp.tensordot(T1, jnp.tensordot(T1, Voovv, [(0,1),(1,2)]), [(0,1),(0,1)])
-    E_ccsd -= jnp.tensordot(T2, Voovv, [(0,1,2,3),(1,0,2,3)])
-    E_ccsd += 2.0*jnp.tensordot(T2, Voovv, [(0,1,2,3),(0,1,2,3)])
-    E_ccsd += 2.0*jnp.tensordot(T1, jnp.tensordot(T1, Voovv, [(0,1),(0,2)]), [(0,1),(0,1)])
+    E_ccsd -= jnp.tensordot(T1, jnp.tensordot(T1, Voovv, [(0, 1), (1, 2)]), [(0, 1), (0, 1)])
+    E_ccsd -= jnp.tensordot(T2, Voovv, [(0, 1, 2, 3), (1, 0, 2, 3)])
+    E_ccsd += 2.0*jnp.tensordot(T2, Voovv, [(0, 1, 2, 3),(0, 1, 2, 3)])
+    E_ccsd += 2.0*jnp.tensordot(T1, jnp.tensordot(T1, Voovv, [(0, 1), (0, 2)]), [(0, 1), (0, 1)])
     return E_ccsd
 
 # Jit compiling ccsd is a BAD IDEA.
@@ -75,9 +75,9 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     newT2 = jnp.zeros(T2.shape)
 
     # T1 equation
-    newT1 += jnp.tensordot(T1, Voovv, [(0,1),(0,2)])
-    newT1 += jnp.tensordot(T2, Vovvv, [(1,2,3), (0,3,2)])
-    newT1 -= jnp.tensordot(Vooov, T2, [(0,1,3),(0,1,3)])
+    newT1 += jnp.tensordot(T1, Voovv, [(0, 1), (0, 2)])
+    newT1 += jnp.tensordot(T2, Vovvv, [(1, 2, 3), (0, 3, 2)])
+    newT1 -= jnp.tensordot(Vooov, T2, [(0, 1, 3), (0, 1, 3)])
     newT1 -= jnp.einsum('kc, la, lkic -> ia', T1, T1, Vooov, optimize = 'optimal')
     newT1 += jnp.einsum('kc, id, kacd -> ia', T1, T1, Vovvv, optimize = 'optimal')
     newT1 -= jnp.einsum('kc, ilad, lkcd -> ia', T1, T2, Voovv, optimize = 'optimal')
@@ -85,12 +85,12 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     newT1 -= jnp.einsum('ic, lkad, lkcd -> ia', T1, T2, Voovv, optimize = 'optimal')
     newT1 -= jnp.einsum('la, ikdc, klcd -> ia', T1, T2, Voovv, optimize = 'optimal')
     newT1 -= jnp.einsum('kc, id, la, klcd -> ia', T1, T1, T1, Voovv, optimize = 'optimal')
-    newT1 += 2.0*jnp.einsum('kc, ilad, klcd -> ia', T1, T2, Voovv, optimize = 'optimal')
+    newT1 += 2.0 * jnp.einsum('kc, ilad, klcd -> ia', T1, T2, Voovv, optimize = 'optimal')
     newT1 *= 2.0
 
-    newT1 -= jnp.tensordot(T1, Vovov, [(0,1),(2,1)])
-    newT1 -= jnp.tensordot(T2, Vovvv, [(0,2,3),(0,3,2)])
-    newT1 += jnp.tensordot(Vooov, T2, [(0,1,3),(1,0,3)])
+    newT1 -= jnp.tensordot(T1, Vovov, [(0, 1), (2, 1)])
+    newT1 -= jnp.tensordot(T2, Vovvv, [(0, 2, 3), (0, 3, 2)])
+    newT1 += jnp.tensordot(Vooov, T2, [(0, 1, 3), (1, 0, 3)])
     newT1 -= jnp.einsum('kc, id, kadc -> ia', T1, T1, Vovvv, optimize = 'optimal')
     newT1 += jnp.einsum('kc, la, klic -> ia', T1, T1, Vooov, optimize = 'optimal')
     newT1 += jnp.einsum('kc, liad, lkcd -> ia', T1, T2, Voovv, optimize = 'optimal')
@@ -104,16 +104,16 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     newT2 -= jnp.einsum('ikac, jlbd, lkcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
     newT2 -= jnp.einsum('kiac, jlbd, klcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
     newT2 -= jnp.einsum('ijac, klbd, klcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
-    newT2 += 2.0*jnp.einsum('ikac, jlbd, klcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
+    newT2 += 2.0 * jnp.einsum('ikac, jlbd, klcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
     newT2 *= 2.0
 
     # Reducing Vvvvv contractions to tensordot is especially productive.
     # TODO try reducing Vovvv as well. Also check if removing jit makes this optimization moot...
     newT2 += Voovv
-    newT2 += jnp.tensordot(T1, jnp.tensordot(T1, Vvvvv, [(1,),(1,)]), [(1,),(1,)])
-    newT2 += jnp.tensordot(T2, Vvvvv, [(2,3),(0,1)])
+    newT2 += jnp.tensordot(T1, jnp.tensordot(T1, Vvvvv, [(1, ), (1, )]), [(1, ), (1, )])
+    newT2 += jnp.tensordot(T2, Vvvvv, [(2, 3), (0, 1)])
     newT2 += jnp.einsum('ka, lb, ijkl -> ijab', T1, T1, Voooo, optimize = 'optimal')
-    newT2 += jnp.tensordot(T2, Voooo, [(0,1),(2,3)]).transpose((2,3,0,1))
+    newT2 += jnp.tensordot(T2, Voooo, [(0, 1), (2, 3)]).transpose((2, 3, 0, 1))
     newT2 -= jnp.einsum('ic, jd, ka, kbcd -> ijab', T1, T1, T1, Vovvv, optimize = 'optimal')
     newT2 -= jnp.einsum('ic, jd, kb, kadc -> ijab', T1, T1, T1, Vovvv, optimize = 'optimal')
     newT2 += jnp.einsum('ic, ka, lb, lkjc -> ijab', T1, T1, T1, Vooov, optimize = 'optimal')
@@ -129,7 +129,7 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     newT2 += jnp.einsum('ic, jd, lkab, lkcd -> ijab', T1, T1, T2, Voovv, optimize = 'optimal')
     newT2 += jnp.einsum('ka, lb, ijdc, lkcd -> ijab', T1, T1, T2, Voovv, optimize = 'optimal')
 
-    P_OVVO  = jnp.tensordot(T2, Voovv, [(1,3),(0,2)]).transpose((0,2,1,3))
+    P_OVVO  = jnp.tensordot(T2, Voovv, [(1, 3),(0, 2)]).transpose((0, 2, 1, 3))
     P_OVVO -= jnp.einsum('lb, ikac, lkjc -> ijab', T1, T2, Vooov, optimize = 'optimal')
     P_OVVO += jnp.einsum('jc, ikad, kbdc -> ijab', T1, T2, Vovvv, optimize = 'optimal')
     P_OVVO += jnp.einsum('kc, ijad, kbcd -> ijab', T1, T2, Vovvv, optimize = 'optimal')
@@ -140,13 +140,13 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     P_OVVO -= jnp.einsum('ikdc, ljab, klcd -> ijab', T2, T2, Voovv, optimize = 'optimal')
     P_OVVO *= 2.0
 
-    P_OVVO -= jnp.tensordot(T1, Vooov, [(0,), (2,)]).transpose((2,1,3,0))
-    P_OVVO += jnp.tensordot(T1, Vovvv, [(1,),(1,)]).transpose((1,0,2,3))
-    P_OVVO -= jnp.tensordot(T2, Voovv, [(0,3),(0,2)]).transpose((0,2,1,3))
+    P_OVVO -= jnp.tensordot(T1, Vooov, [(0, ), (2, )]).transpose((2, 1, 3, 0))
+    P_OVVO += jnp.tensordot(T1, Vovvv, [(1, ), (1, )]).transpose((1, 0, 2, 3))
+    P_OVVO -= jnp.tensordot(T2, Voovv, [(0, 3), (0, 2)]).transpose((0, 2, 1, 3))
     P_OVVO -= jnp.einsum('ic, ka, kjcb -> ijab', T1, T1, Voovv, optimize = 'optimal')
     P_OVVO -= jnp.einsum('ic, kb, jcka -> ijab', T1, T1, Vovov, optimize = 'optimal')
-    P_OVVO -= jnp.tensordot(T2, Vovov, [(1,3),(2,1)]).transpose((0,2,1,3))
-    P_OVVO -= jnp.tensordot(T2, Vovov, [(0,3),(2,1)]).transpose((2,0,1,3))
+    P_OVVO -= jnp.tensordot(T2, Vovov, [(1, 3), (2, 1)]).transpose((0, 2, 1, 3))
+    P_OVVO -= jnp.tensordot(T2, Vovov, [(0, 3), (2, 1)]).transpose((2, 0, 1, 3))
     P_OVVO += jnp.einsum('lb, kiac, lkjc -> ijab', T1, T2, Vooov, optimize = 'optimal')
     P_OVVO -= jnp.einsum('jc, ikdb, kacd -> ijab', T1, T2, Vovvv, optimize = 'optimal')
     P_OVVO -= jnp.einsum('jc, kiad, kbdc -> ijab', T1, T2, Vovvv, optimize = 'optimal')
@@ -165,7 +165,7 @@ def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
     P_OVVO += jnp.einsum('ic, lb, kjad, klcd -> ijab', T1, T1, T2, Voovv, optimize = 'optimal')
 
     newT2 += P_OVVO 
-    newT2 += P_OVVO.transpose((1,0,3,2))
+    newT2 += P_OVVO.transpose((1, 0, 3, 2))
 
     newT1 *= d
     newT2 *= D
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 2ab6a44..43b41f5 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -4,8 +4,7 @@
 from jax.lax import while_loop
 
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
-from .ccsd import rccsd 
-from ..integrals import integrals_utils
+from .ccsd import rccsd
 
 def perturbative_triples(T1, T2, V, fock_Od, fock_Vd):
     Voooo, Vooov, Voovv, Vovov, Vovvv, Vvvvv = V
@@ -43,22 +42,23 @@ def loop_a(arr0):
            def loop_b(arr1):
               a_1, b_1, c_1, pT_contribution_1 = arr1
               c_1 = 0
-              delta_vir = 1 + delta_v[a_1,b_1]
+              delta_vir = 1 + delta_v[a_1, b_1]
 
               def loop_c(arr2):
-                 a_2, b_2, c_2, pT_contribution_2 = arr2
-                 delta_vir = delta_vir + delta_v[b_2,c_2]
+                 a_2, b_2, c_2, delta_vir_2, pT_contribution_2 = arr2
+                 delta_vir_2 = delta_vir_2 + delta_v[b_2,c_2]
                  Dd = Dd_occ - (fock_Vd[a_2] + fock_Vd[b_2] + fock_Vd[c_2])
-                 X = W[a_2,b_2,c_2]*V[a_2,b_2,c_2] + W[a_2,c_2,b_2]*V[a_2,c_2,b_2] + W[b_2,a_2,c_2]*V[b_2,a_2,c_2]  \
-                   + W[b_2,c_2,a_2]*V[b_2,c_2,a_2] + W[c_2,a_2,b_2]*V[c_2,a_2,b_2] + W[c_2,b_2,a_2]*V[c_2,b_2,a_2]
-                 Y = (V[a_2,b_2,c_2] + V[b_2,c_2,a_2] + V[c_2,a_2,b_2])
-                 Z = (V[a_2,c_2,b_2] + V[b_2,a_2,c_2] + V[c_2,b_2,a_2])
-                 E = (Y - 2*Z)*(W[a_2,b_2,c_2] + W[b_2,c_2,a_2] + W[c_2,a_2,b_2]) + (Z - 2*Y)*(W[a_2,c_2,b_2]+W[b_2,a_2,c_2]+W[c_2,b_2,a_2]) + 3*X
-                 pT_contribution_2 += E * delta_occ / (Dd * delta_vir)
+                 X = W[a_2, b_2, c_2]*V[a_2, b_2, c_2] + W[a_2, c_2, b_2]*V[a_2, c_2, b_2] + W[b_2, a_2, c_2]*V[b_2, a_2, c_2]  \
+                   + W[b_2, c_2, a_2]*V[b_2, c_2, a_2] + W[c_2, a_2, b_2]*V[c_2, a_2, b_2] + W[c_2, b_2, a_2]*V[c_2, b_2, a_2]
+                 Y = (V[a_2, b_2, c_2] + V[b_2, c_2, a_2] + V[c_2, a_2, b_2])
+                 Z = (V[a_2, c_2, b_2] + V[b_2, a_2, c_2] + V[c_2, b_2, a_2])
+                 E = (Y - 2 * Z) * (W[a_2, b_2, c_2] + W[b_2, c_2, a_2] + W[c_2, a_2, b_2]) \
+                   + (Z - 2 * Y) * (W[a_2, c_2, b_2] + W[b_2, a_2, c_2]+W[c_2, b_2, a_2]) + 3 * X
+                 pT_contribution_2 += E * delta_occ / (Dd * delta_vir_2)
                  c_2 += 1
-                 return (a_2, b_2, c_2, pT_contribution_2)
+                 return (a_2, b_2, c_2, delta_vir_2, pT_contribution_2)
 
-              a_1_, b_1_, c_1_, pT_contribution_1_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_c, (a_1, b_1, c_1, pT_contribution_1))
+              a_1_, b_1_, c_1_, delta_vir_, pT_contribution_1_ = while_loop(lambda arr2: arr2[2] < arr2[1] + 1, loop_c, (a_1, b_1, c_1, delta_vir, pT_contribution_1))
               b_1_ += 1
               return (a_1_, b_1_, c_1_, pT_contribution_1_)
 
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index d9ce966..75126e2 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -20,9 +20,9 @@ def restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge,
 
     # If we are doing MP2 or CCSD after, might as well use jit-compiled JK-build, since HF will not be memory bottleneck
     if return_aux_data:
-        jk_build = jax.jit(jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0,1),(0,1)]), in_axes=(0,None)), in_axes=(0,None)))
+        jk_build = jax.jit(jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None)))
     else: 
-        jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0,1),(0,1)]), in_axes=(0,None)), in_axes=(0,None))
+        jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None))
 
     S, T, V, G = compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv_order, options)
     # Canonical orthogonalization via cholesky decomposition
@@ -49,7 +49,7 @@ def rhf_iter(F,D):
         Fp = jnp.dot(A.T, jnp.dot(F, A))
         Fp = Fp + shift 
         eps, C2 = jnp.linalg.eigh(Fp)
-        C = jnp.dot(A,C2)
+        C = jnp.dot(A, C2)
         Cocc = C[:, :ndocc]
         D = jnp.dot(Cocc, Cocc.T)
         return E_scf, D, C, eps
@@ -70,23 +70,23 @@ def rhf_iter(F,D):
                 Dold = D * 1
         # Build JK matrix: 2 * J - K
         JK = 2 * jk_build(G, D)
-        JK -= jk_build(G.transpose((0,2,1,3)), D)
+        JK -= jk_build(G.transpose((0, 2, 1, 3)), D)
         # Build Fock
         F = H + JK
         # Update convergence error
         if iteration > 1:
             diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
             diis_e = A.dot(diis_e).dot(A)
-            dRMS = jnp.mean(diis_e**2)**0.5
+            dRMS = jnp.mean(diis_e ** 2) ** 0.5
         # Compute energy, transform Fock and diagonalize, get new density
-        E_scf, D, C, eps = rhf_iter(F,D)
+        E_scf, D, C, eps = rhf_iter(F, D)
         iteration += 1
         if iteration == maxit:
             break
     print(iteration, " RHF iterations performed")
 
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 
-    tmp = jnp.round(eps,6)
+    tmp = jnp.round(eps, 6)
     ndegen_orbs =  tmp.shape[0] - jnp.unique(tmp).shape[0] 
     if (ndegen_orbs / nbf) > 0.20:
         print("Hartree-Fock warning: More than 20% of orbitals have degeneracies. Higher order derivatives may be unstable due to eigendecomposition AD rule")
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 1933da6..82ac1c7 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -7,117 +7,54 @@
 import psi4
 import os
 
-from ..integrals.basis_utils import build_basis_set
-from ..integrals.tei import tei_array 
-from ..integrals.oei import oei_arrays
-
 from ..utils import get_deriv_vec_idx, get_required_deriv_vecs
 
-# Check for Libint interface 
-from ..constants import libint_imported
-if libint_imported:
-    from ..external_integrals import TEI 
-    from ..external_integrals import OEI 
-    from ..external_integrals import libint_interface
-    from ..external_integrals import tmp_potential
+# Import Libint interface
+from ..integrals import TEI 
+from ..integrals import OEI 
+from ..integrals import libint_interface
      
 
 def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk 
     algo = options['integral_algo']
 
-    if libint_imported and libint_interface.LIBINT2_MAX_DERIV_ORDER >= deriv_order:
-        if algo == 'libint_core':
+    if algo == 'libint_disk':
+        # Check disk for currently existing integral derivatives 
+        check = check_disk(geom,basis_name,xyz_path,deriv_order)
+
+        tei_obj = TEI(basis_name, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis_name, xyz_path, deriv_order, 'disk')
+        # If disk integral derivs are right, nothing to do
+        if check:
             libint_interface.initialize(xyz_path, basis_name)
-            # Precompute TEI derivatives 
-            tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
-            oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
-            # Compute integrals
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
-            # Since Libint does not support potentials beyond 2nd order,
-            # don't use Libint in that case. 
-            # TODO revert if Libint ever changes
-            if deriv_order <= 2:
-                V = oei_obj.potential(geom)
-            else:
-                with open(xyz_path, 'r') as f:
-                    tmp = f.read()
-                molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-                basis_dict = build_basis_set(molecule, basis_name)
-                V = tmp_potential(geom.reshape(-1,3),basis_dict,nuclear_charges)
+            V = oei_obj.potential(geom)
             G = tei_obj.tei(geom)
             libint_interface.finalize()
-            return S, T, V, G
-
-        elif algo == 'libint_disk' and deriv_order > 0:
-            # Check disk for currently existing integral derivatives 
-            check = check_disk(geom,basis_name,xyz_path,deriv_order)
-
-            tei_obj = TEI(basis_name, xyz_path, deriv_order, 'disk')
-            oei_obj = OEI(basis_name, xyz_path, deriv_order, 'disk')
-            # If disk integral derivs are right, nothing to do
-            if check:
-                libint_interface.initialize(xyz_path, basis_name)
-                S = oei_obj.overlap(geom)
-                T = oei_obj.kinetic(geom)
-                V = oei_obj.potential(geom)
-                G = tei_obj.tei(geom)
-                libint_interface.finalize()
-            else:
-                # Else write integral derivs to disk
-                if deriv_order <= 2:
-                    libint_interface.initialize(xyz_path, basis_name)
-                    libint_interface.oei_deriv_disk(deriv_order)
-                    libint_interface.eri_deriv_disk(deriv_order)
-                    S = oei_obj.overlap(geom)
-                    T = oei_obj.kinetic(geom)
-                    V = oei_obj.potential(geom)
-                    G = tei_obj.tei(geom)
-                    libint_interface.finalize()
-                else:
-                    # If higher order than 2, LIBINT api does not support potentials 
-                    # In this case, use Libint to write TEI's to disk, and do OEI's with Quax
-                    libint_interface.initialize(xyz_path, basis_name)
-                    libint_interface.eri_deriv_disk(deriv_order)
-                    G = tei_obj.tei(geom)
-                    libint_interface.finalize()
-    
-                    with open(xyz_path, 'r') as f:
-                        tmp = f.read()
-                    molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-                    basis_dict = build_basis_set(molecule, basis_name)
-                    S, T, V = oei_arrays(geom.reshape(-1,3),basis_dict,nuclear_charges)
-        elif deriv_order == 0:
+        else:
             libint_interface.initialize(xyz_path, basis_name)
-            tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
-            oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
-            # Compute integrals
+            libint_interface.oei_deriv_disk(deriv_order)
+            libint_interface.eri_deriv_disk(deriv_order)
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
             G = tei_obj.tei(geom)
             libint_interface.finalize()
 
-        # TODO
-        #elif algo == 'quax_disk':
-
-        elif algo == 'quax_core':
-            with open(xyz_path, 'r') as f:
-                tmp = f.read()
-            molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-            basis_dict = build_basis_set(molecule, basis_name)
-            S, T, V = oei_arrays(geom.reshape(-1,3),basis_dict,nuclear_charges)
-            G = tei_array(geom.reshape(-1,3),basis_dict)
-
-    # If Libint not imported or Libint version doesnt support requested deriv order, use Quax integrals
     else:
-        with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_dict = build_basis_set(molecule, basis_name)
-        S, T, V = oei_arrays(geom.reshape(-1,3),basis_dict,nuclear_charges)
-        G = tei_array(geom.reshape(-1,3),basis_dict)
+        libint_interface.initialize(xyz_path, basis_name)
+        # Precompute TEI derivatives 
+        tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
+        oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
+        # Compute integrals
+        S = oei_obj.overlap(geom)
+        T = oei_obj.kinetic(geom)
+        V = oei_obj.potential(geom)
+        G = tei_obj.tei(geom)
+        libint_interface.finalize()
+
     return S, T, V, G
 
 def check_disk(geom,basis_name,xyz_path,deriv_order,address=None):
@@ -161,71 +98,6 @@ def check_disk(geom,basis_name,xyz_path,deriv_order,address=None):
         correct_int_derivs = correct_nbf
     return correct_int_derivs
 
-def write_integrals(molecule, basis_name, deriv_order, address):
-    geom = jnp.asarray(np.asarray(molecule.geometry()))
-    natoms = geom.shape[0]
-    geom_list = np.asarray(molecule.geometry()).reshape(-1).tolist()
-    charge = molecule.molecular_charge()
-    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom.shape[0])])
-    basis_dict = build_basis_set(molecule,basis_name)
-    kwargs = {"basis_dict":basis_dict,"nuclear_charges":nuclear_charges}
-
-    # Define wrapper functions for computing partial derivatives
-    def oei_wrapper(*args, **kwargs):
-        geom = jnp.asarray(args)
-        basis_dict = kwargs['basis_dict']
-        nuclear_charges = kwargs['nuclear_charges']
-        S, T, V = oei_arrays(geom.reshape(-1,3),basis_dict,nuclear_charges)
-        return S, T, V
-
-    def tei_wrapper(*args, **kwargs):
-        geom = jnp.asarray(args)
-        basis_dict = kwargs['basis_dict']
-        G = tei_array(geom.reshape(-1,3),basis_dict)
-        return G
-
-    # Determine the set of all integral derivatives that need to be written 
-    # to disk for this computation
-    deriv_vecs = get_required_deriv_vecs(natoms, deriv_order, address)
-    for deriv_vec in deriv_vecs:
-        flat_idx = get_deriv_vec_idx(deriv_vec)
-        order = np.sum(deriv_vec)
-        # Compute partial derivative integral arrays corresponding to this deriv vec
-        if order == 1:
-            i = address[0]
-            dS, dT, dV = jacfwd(oei_wrapper, i)(*geom_list, **kwargs)
-            dG = jacfwd(tei_wrapper, i)(*geom_list, **kwargs)
-        elif order == 2:
-            i,j = address[0], address[1]
-            dS, dT, dV = jacfwd(jacfwd(oei_wrapper, i), j)(*geom_list, **kwargs)
-            dG = jacfwd(jacfwd(tei_wrapper, i), j)(*geom_list, **kwargs)
-        elif order == 3:
-            i,j,k = address[0], address[1], address[2]
-            dS, dT, dV = jacfwd(jacfwd(jacfwd(oei_wrapper, i), j), k)(*geom_list, **kwargs)
-            dG = jacfwd(jacfwd(jacfwd(tei_wrapper, i), j), k)(*geom_list, **kwargs)
-        elif order == 4:
-            i,j,k,l = address[0], address[1], address[2], address[3]
-            dS, dT, dV= jacfwd(jacfwd(jacfwd(jacfwd(oei_wrapper, i), j), k), l)(*geom_list, **kwargs)
-            dG = jacfwd(jacfwd(jacfwd(jacfwd(tei_wrapper, i), j), k), l)(*geom_list, **kwargs)
-        elif order == 5:
-            i,j,k,l,m = address[0], address[1], address[2], address[3], address[4]
-            dS, dT, dV= jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(oei_wrapper, i), j), k), l), m)(*geom_list, **kwargs)
-            dG = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(tei_wrapper, i), j), k), l), m)(*geom_list, **kwargs)
-        elif order == 6:
-            i,j,k,l,m,n = address[0], address[1], address[2], address[3], address[4], address[5]
-            dS, dT, dV= jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(oei_wrapper, i), j), k), l), m), n)(*geom_list, **kwargs)
-            dG = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(tei_wrapper, i), j), k), l), m), n)(*geom_list, **kwargs)
-        # Save partial derivative arrays to disk
-        f = h5py.File("oei_partials.h5","a")
-        f.create_dataset("overlap_deriv"+str(order)+"_"+str(flat_idx), data=dS)
-        f.create_dataset("kinetic_deriv"+str(order)+"_"+str(flat_idx), data=dT)
-        f.create_dataset("potential_deriv"+str(order)+"_"+str(flat_idx), data=dV)
-        f.close()
-
-        f = h5py.File("eri_partials.h5","a")
-        f.create_dataset("eri_deriv"+str(order)+"_"+str(flat_idx), data=dG)
-        f.close()
-
 
 
               
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 792b33b..a2a1f4b 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -15,7 +15,7 @@ def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options,
     nvirt = G.shape[0] - ndocc
     nbf = G.shape[0]
 
-    G = partial_tei_transformation(G, C[:,:ndocc],C[:,ndocc:],C[:,:ndocc],C[:,ndocc:])
+    G = partial_tei_transformation(G, C[:,:ndocc], C[:,ndocc:], C[:,:ndocc], C[:,ndocc:])
 
     # Create tensor dim (occ,vir,occ,vir) of all possible orbital energy denominators
     eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
@@ -29,12 +29,12 @@ def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options,
 
     # Loop algo (lower memory, but tei transform is the memory bottleneck)
     # Create all combinations of four loop variables to make XLA compilation easier
-    indices = cartesian_product(jnp.arange(ndocc),jnp.arange(ndocc),jnp.arange(nvirt),jnp.arange(nvirt))
+    indices = cartesian_product(jnp.arange(ndocc), jnp.arange(ndocc), jnp.arange(nvirt), jnp.arange(nvirt))
 
     mp2_correlation = 0.0
     def loop_mp2(idx, mp2_corr):
         i,j,a,b = indices[idx]
-        mp2_corr += G[i, a, j, b] * (2 * G[i, a, j, b] - G[i, b, j, a]) * e_denom[i,a,j,b]
+        mp2_corr += G[i, a, j, b] * (2 * G[i, a, j, b] - G[i, b, j, a]) * e_denom[i, a, j, b]
         return mp2_corr
 
     dE_mp2 = fori_loop(0, indices.shape[0], loop_mp2, mp2_correlation)
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
index d31ae01..a1fa101 100644
--- a/tests/test_gradients.py
+++ b/tests/test_gradients.py
@@ -23,7 +23,7 @@
                   'points':5,
                   'fd_project':False})
 
-options = {'damping':True, 'spectral_shift':False, 'integral_algo': 'quax_core'}
+options = {'damping':True, 'spectral_shift':False}
 
 def test_hartree_fock_gradient(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.gradient(method + '/' + basis_name)), 10)
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index 929181f..b0a36d2 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -23,7 +23,7 @@
                   'points':5,
                   'fd_project':False})
 
-options = {'damping':True, 'spectral_shift':False, 'integral_algo': 'quax_core'}
+options = {'damping':True, 'spectral_shift':False}
 
 def test_hartree_fock_hessian(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.hessian(method + '/' + basis_name)), 10)

From 9c9c5da316085a02f338d7d2d00689d38108783d Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 15 Sep 2023 15:47:00 -0400
Subject: [PATCH 05/91] Fix whitespace

---
 quax/methods/ints.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 82ac1c7..95e681e 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -10,8 +10,8 @@
 from ..utils import get_deriv_vec_idx, get_required_deriv_vecs
 
 # Check for Libint interface
-from ..integrals import TEI 
-from ..integrals import OEI 
+from ..integrals import TEI
+from ..integrals import OEI
 from ..integrals import libint_interface
      
 
@@ -20,7 +20,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
     algo = options['integral_algo']
 
     if algo == 'libint_disk':
-        # Check disk for currently existing integral derivatives 
+        # Check disk for currently existing integral derivatives
         check = check_disk(geom,basis_name,xyz_path,deriv_order)
 
         tei_obj = TEI(basis_name, xyz_path, deriv_order, 'disk')
@@ -45,7 +45,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
 
     else:
         libint_interface.initialize(xyz_path, basis_name)
-        # Precompute TEI derivatives 
+        # Precompute TEI derivatives
         tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
         oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
         # Compute integrals

From f3c93a770679b22c758bb14e27a919e5218f863c Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 18 Sep 2023 16:39:55 -0400
Subject: [PATCH 06/91] First attempt OEI core derivs

---
 quax/integrals/buffer_lookups.h    |  70 ++++++-
 quax/integrals/libint_interface.cc | 325 +++++++++++++++++++++++++++--
 quax/integrals/oei.py              |  40 ++--
 quax/integrals/tei.py              |   4 +-
 quax/methods/ccsd_t.py             |   6 +-
 5 files changed, 397 insertions(+), 48 deletions(-)

diff --git a/quax/integrals/buffer_lookups.h b/quax/integrals/buffer_lookups.h
index fea4537..d9b6f96 100644
--- a/quax/integrals/buffer_lookups.h
+++ b/quax/integrals/buffer_lookups.h
@@ -19,7 +19,7 @@ std::vector<std::vector<int>> generate_2d_lookup(int dim_size) {
     vector<vector<int>> lookup(dim_size, vector<int> (dim_size, 0));
     vector<vector<int>> combos; // always the same, list of lists
 
-    // Collect multidimensional indices corresponding to generalized upper triangle 
+    // Collect multidimensional indices corresponding to generalized upper triangle
     for (int i = 0; i < dim_size; i++) {
       for (int j = i; j < dim_size; j++) {
         vector<int> tmp = {i, j};
@@ -39,11 +39,10 @@ std::vector<std::vector<int>> generate_2d_lookup(int dim_size) {
 }
 
 std::vector<std::vector<std::vector<int>>> generate_3d_lookup(int dim_size) { 
-    //TODO test this.
     using namespace std;
     vector<vector<vector<int>>> lookup(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size)));
     vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle 
+    // Collect multidimensional indices corresponding to generalized upper triangle
     for (int i = 0; i < dim_size; i++) {
       for (int j = i; j < dim_size; j++) {
         for (int k = j; k < dim_size; k++) {
@@ -65,11 +64,10 @@ std::vector<std::vector<std::vector<int>>> generate_3d_lookup(int dim_size) {
 }
 
 std::vector<std::vector<std::vector<std::vector<int>>>> generate_4d_lookup(int dim_size) { 
-    //TODO test this.
     using namespace std;
     vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
     vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle 
+    // Collect multidimensional indices corresponding to generalized upper triangle
     for (int i = 0; i < dim_size; i++) {
       for (int j = i; j < dim_size; j++) {
         for (int k = j; k < dim_size; k++) {
@@ -91,3 +89,65 @@ std::vector<std::vector<std::vector<std::vector<int>>>> generate_4d_lookup(int d
     }
     return lookup;
 }
+
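+/* NOTE(review): disabled code. 'lookup' below is declared with 4 nested vectors but is indexed and returned as 5 (resp. 6) nested vectors; fix before enabling.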
+std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> generate_5d_lookup(int dim_size) {
+    using namespace std;
+    vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
+    vector<vector<int>> combos; // always the same, list of lists
+    // Collect multidimensional indices corresponding to generalized upper triangle
+    for (int i = 0; i < dim_size; i++) {
+      for (int j = i; j < dim_size; j++) {
+        for (int k = j; k < dim_size; k++) {
+          for (int l = k; l < dim_size; l++) {
+            for (int m = l; m < dim_size; m++) {
+                vector<int> tmp = {i, j, k, l, m};
+                combos.push_back(tmp);
+            }
+          }
+        }
+      }
+    }
+    // Build lookup array and return
+    for (int i = 0; i < combos.size(); i++){
+        auto multi_idx = combos[i];
+        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
+        do {
+        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]][multi_idx[3]][multi_idx[4]] = i;
+        }
+        while (next_permutation(multi_idx.begin(),multi_idx.end()));
+    }
+    return lookup;
+}
+
+std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> generate_6d_lookup(int dim_size) {
+    using namespace std;
+    vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
+    vector<vector<int>> combos; // always the same, list of lists
+    // Collect multidimensional indices corresponding to generalized upper triangle
+    for (int i = 0; i < dim_size; i++) {
+      for (int j = i; j < dim_size; j++) {
+        for (int k = j; k < dim_size; k++) {
+          for (int l = k; l < dim_size; l++) {
+            for (int m = l; m < dim_size; m++) {
+              for (int n = m; n < dim_size; n++) {
+                vector<int> tmp = {i, j, k, l, m, n};
+                combos.push_back(tmp);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Build lookup array and return
+    for (int i = 0; i < combos.size(); i++){
+        auto multi_idx = combos[i];
+        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
+        do {
+        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]][multi_idx[3]][multi_idx[4]][multi_idx[5]] = i;
+        }
+        while (next_permutation(multi_idx.begin(),multi_idx.end()));
+    }
+    return lookup;
+}
+*/
\ No newline at end of file
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index e306c7e..7a3d931 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -32,10 +32,14 @@ static const std::vector<int> buffer_index_eri1d = generate_1d_lookup(12);
 static const std::vector<std::vector<int>> buffer_index_eri2d = generate_2d_lookup(12);
 static const std::vector<std::vector<std::vector<int>>> buffer_index_eri3d = generate_3d_lookup(12);
 static const std::vector<std::vector<std::vector<std::vector<int>>>> buffer_index_eri4d = generate_4d_lookup(12);
+//static const std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> buffer_index_eri5d = generate_5d_lookup(12);
+//static const std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> buffer_index_eri6d = generate_6d_lookup(12);
 static const std::vector<int> buffer_index_oei1d = generate_1d_lookup(6);
 static const std::vector<std::vector<int>> buffer_index_oei2d = generate_2d_lookup(6);
 static const std::vector<std::vector<std::vector<int>>> buffer_index_oei3d = generate_3d_lookup(6);
 static const std::vector<std::vector<std::vector<std::vector<int>>>> buffer_index_oei4d = generate_4d_lookup(6);
+//static const std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> buffer_index_oei5d = generate_5d_lookup(6);
+//static const std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> buffer_index_oei6d = generate_6d_lookup(6);
 
 // Creates atom objects from xyz file path
 std::vector<libint2::Atom> get_atoms(std::string xyzfilename) 
@@ -221,7 +225,7 @@ py::array potential() {
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
-    
+
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto n1 = obs[s1].size(); // number of basis functions in first shell
@@ -293,7 +297,7 @@ py::array eri() {
 
 // Computes nuclear derivatives of overlap integrals
 py::array overlap_deriv(std::vector<int> deriv_vec) {
-    assert(3 * atoms.size() == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
@@ -310,12 +314,12 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
     std::vector<double> result(length);
 
     const auto& buf_vec = s_engine.results(); // will point to computed shell sets
-    
+
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
         auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2=0; s2 != obs.size(); ++s2) {
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -379,7 +383,7 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 
 // Computes nuclear derivatives of kinetic energy integrals
 py::array kinetic_deriv(std::vector<int> deriv_vec) {
-    assert(3 * atoms.size() == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
@@ -394,7 +398,7 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
-    
+
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
         auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
@@ -463,13 +467,13 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 
 // Computes nuclear derivatives of potential energy integrals 
 py::array potential_deriv(std::vector<int> deriv_vec) {
-    assert(3 * atoms.size() == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
     // Lookup arrays for mapping shell derivative index to buffer index 
     // Potential lookup arrays depend on atom size
-    int dimensions = 6 + 3 * atoms.size();
+    int dimensions = 6 + ncart;
     static const std::vector<int> buffer_index_potential1d = generate_1d_lookup(dimensions);
     static const std::vector<std::vector<int>> buffer_index_potential2d = generate_2d_lookup(dimensions);
     static const std::vector<std::vector<std::vector<int>>> buffer_index_potential3d = generate_3d_lookup(dimensions);
@@ -575,13 +579,13 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
 
             // Loop over every buffer index and accumulate for every shell set.
             for(auto i = 0; i < buffer_indices.size(); ++i) {
-              auto ints_shellset = buf_vec[buffer_indices[i]]; 
-              if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-              for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                  result[(bf1 + f1) * nbf + bf2 + f2] += ints_shellset[idx]; 
+                auto ints_shellset = buf_vec[buffer_indices[i]];
+                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        result[(bf1 + f1) * nbf + bf2 + f2] += ints_shellset[idx];
+                    }
                 }
-              }
             }
         }
     }
@@ -597,7 +601,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
     std::vector<int> desired_coordinates;
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
-    assert(3 * atoms.size() == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
     // ERI derivative integral engine
     libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
@@ -777,11 +781,11 @@ void oei_deriv_disk(int max_deriv_order) {
         // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
         const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
         // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-        int dimensions = 6 + 3 * natom;
+        int dimensions = 6 + ncart;
         const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(dimensions, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(natom * 3, deriv_order);
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Define engines and buffers
         libint2::Engine overlap_engine(libint2::Operator::overlap,obs.max_nprim(),obs.max_l(),deriv_order);
@@ -844,7 +848,7 @@ void oei_deriv_disk(int max_deriv_order) {
                         int desired_atom_idx = multi_cart_idx[j] / 3;
                         int desired_coord = multi_cart_idx[j] % 3;
                         // Loop over shell indices
-                        for (int i=0; i<2; i++){
+                        for (int i = 0; i < 2; i++){
                             int atom_idx = shell_atom_index_list[i];
                             if (atom_idx == desired_atom_idx) {
                                 int tmp = 3 * i + desired_coord;
@@ -972,7 +976,7 @@ void eri_deriv_disk(int max_deriv_order) {
         const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(natom * 3, deriv_order);
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Libint engine for computing shell quartet derivatives
         libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
@@ -1094,6 +1098,278 @@ delete file;
 std::cout << " done" << std::endl;
 } // eri_deriv_disk function
 
+// Computes every unique nuclear derivative of order 'deriv_order' of the overlap integrals; returns one flat array of nderivs_triu blocks of nbf * nbf values, all kept in core memory
+py::array overlap_deriv_core(int deriv_order) {
+    int nshell_derivs = how_many_derivs(2, deriv_order); // NOTE(review): never read again in this function
+    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Define the overlap engine and the buffer its results are read from
+    libint2::Engine overlap_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+    const auto& overlap_buffer = overlap_engine.results();
+
+    size_t length = nbf * nbf * nderivs_triu;
+    std::vector<double> result(length);
+
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
+        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+        auto atom1 = shell2atom[s1]; // Atom index of shell 1
+        auto n1 = obs[s1].size();    // number of basis functions in shell 1
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom[s2]; // Atom index of shell 2
+            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            overlap_engine.compute(obs[s1], obs[s2]);
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf * nbf; // start of this derivative's nbf * nbf block in 'result'
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Create a vector of vectors called `indices`, where each subvector is your possible choices
+                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over the two shells of the duet
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin(); // buf_idx stays 0 when lower_bound returns end()
+                    buffer_indices.push_back(buf_idx);
+                }
+
+                // Loop over shell block for each buffer index which contributes to this derivative
+                // and accumulate the overlap shell set into this derivative's block of 'result'
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]]; // NOTE(review): no nullptr guard here; potential_deriv skips null (screened-out) shell sets - confirm none is needed
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            result[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+        }
+    } // shell duet loops
+    return py::array(result.size(), result.data()); // Returned as a 1d array; this apparently copies the data, a workaround is discussed in https://github.com/pybind/pybind11/issues/1042
+} // overlap_deriv_core function
+
+// Computes every unique nuclear derivative of order 'deriv_order' of the kinetic integrals; returns one flat array of nderivs_triu blocks of nbf * nbf values, all kept in core memory
+py::array kinetic_deriv_core(int deriv_order) {
+    int nshell_derivs = how_many_derivs(2, deriv_order); // NOTE(review): never read again in this function
+    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Define the kinetic engine and the buffer its results are read from
+    libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+    const auto& kinetic_buffer = kinetic_engine.results();
+
+    size_t length = nbf * nbf * nderivs_triu;
+    std::vector<double> result(length);
+
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
+        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+        auto atom1 = shell2atom[s1]; // Atom index of shell 1
+        auto n1 = obs[s1].size();    // number of basis functions in shell 1
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom[s2]; // Atom index of shell 2
+            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            kinetic_engine.compute(obs[s1], obs[s2]);
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf * nbf; // start of this derivative's nbf * nbf block in 'result'
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Create a vector of vectors called `indices`, where each subvector is your possible choices
+                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over the two shells of the duet
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin(); // buf_idx stays 0 when lower_bound returns end()
+                    buffer_indices.push_back(buf_idx);
+                }
+
+                // Loop over shell block for each buffer index which contributes to this derivative
+                // and accumulate the kinetic shell set into this derivative's block of 'result'
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]]; // NOTE(review): no nullptr guard here; potential_deriv skips null (screened-out) shell sets - confirm none is needed
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            result[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+        }
+    } // shell duet loops
+    return py::array(result.size(), result.data()); // Returned as a 1d array; this apparently copies the data, a workaround is discussed in https://github.com/pybind/pybind11/issues/1042
+} // kinetic_deriv_core function
+
+// Computes a single 'deriv_order' derivative tensor of potential integrals, keeps everything in core memory
+py::array potential_deriv_core(int deriv_order) {
+    int nshell_derivs = how_many_derivs(2, deriv_order, natom);
+    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    int dimensions = 6 + ncart;
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(dimensions, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Define engines and buffers
+    libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    potential_engine.set_params(libint2::make_point_charges(atoms));
+    const auto& potential_buffer = potential_engine.results();
+
+    size_t length = nbf * nbf * nderivs_triu;
+    std::vector<double> result(length);
+
+    for(auto s1 = 0; s1 != obs.size(); ++s1) {
+        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+        auto atom1 = shell2atom[s1]; // Atom index of shell 1
+        auto n1 = obs[s1].size();    // number of basis functions in shell 1
+        for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom[s2]; // Atom index of shell 2
+            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            potential_engine.compute(obs[s1], obs[s2]);
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf * nbf;
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Create a vector of vectors called `indices`, where each subvector is your possible choices
+                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i=0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                    // Loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index.
+                    for (int i = 0; i < natom; i++){
+                        if (i == desired_atom_idx) {
+                            int offset_i = i + 2;
+                            int tmp = 3 * offset_i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
+                }
+
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            result[(bf1 + f1) * nbf + bf2 + f2  + offset_nuc_idx] += potential_shellset[idx];
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+        }
+    } // shell duet loops
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // potential_deriv_core function
 
 // Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
 py::array eri_deriv_core(int deriv_order) {
@@ -1106,7 +1382,7 @@ py::array eri_deriv_core(int deriv_order) {
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(natom * 3, deriv_order);
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
     libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
@@ -1150,7 +1426,7 @@ py::array eri_deriv_core(int deriv_order) {
                         for (int j = 0; j < multi_cart_idx.size(); j++){
                             int desired_atom_idx = multi_cart_idx[j] / 3;
                             int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i<4; i++){
+                            for (int i = 0; i < 4; i++){
                                 int atom_idx = shell_atom_index_list[i];
                                 if (atom_idx == desired_atom_idx) {
                                     int tmp = 3 * i + desired_coord;
@@ -1207,7 +1483,7 @@ py::array eri_deriv_core(int deriv_order) {
         }
     } // shell quartet loops
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // eri_deriv_disk function
+} // eri_deriv_core function
 
 // Define module named 'libint_interface' which can be imported with python
 // The second arg, 'm' defines a variable py::module_ which can be used to create
@@ -1224,8 +1500,11 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("kinetic_deriv", &kinetic_deriv, "Computes kinetic integral nuclear derivatives with libint");
     m.def("potential_deriv", &potential_deriv, "Computes potential integral nuclear derivatives with libint");
     m.def("eri_deriv", &eri_deriv, "Computes electron repulsion integral nuclear derivatives with libint");
-    m.def("eri_deriv_disk", &eri_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_disk", &oei_deriv_disk, "Computes overlap, kinetic, and potential integral derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("eri_deriv_disk", &eri_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("overlap_deriv_core", &overlap_deriv_core, "Computes a single overlap integral derivative tensor, in memory.");
+    m.def("kinetic_deriv_core", &kinetic_deriv_core, "Computes a single kinetic integral derivative tensor, in memory.");
+    m.def("potential_deriv_core", &potential_deriv_core, "Computes a single potential integral nuclear derivative tensor, in memory.");
     m.def("eri_deriv_core", &eri_deriv_core, "Computes a single coulomb integral nuclear derivative tensor, in memory.");
     //TODO partial derivative impl's
     //m.def("eri_partial_deriv_disk", &eri_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 976765e..5ca372e 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -19,9 +19,22 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         natoms = molecule.natom()
         nbf = basis_set.nbf()
 
-        # TODO implement core-algo for OEI's in libint_interface.cc
-        #if mode == 'core' and max_deriv_order > 0:
-            #self.oei_derivatives = {}
+        if mode == 'core' and max_deriv_order > 0:
+            # A list of OEI derivative tensors, containing only unique elements
+            # corresponding to upper hypertriangle (since derivative tensors are symmetric)
+            # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf)
+            # Then when JAX calls JVP, read appropriate slice
+            self.overlap_derivatives = []
+            self.kinetic_derivatives = []
+            self.potential_derivatives = []
+            for i in range(max_deriv_order):
+                n_unique_derivs = how_many_derivs(natoms, i + 1)
+                overlap_deriv = libint_interface.overlap_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
+                kinetic_deriv = libint_interface.kinetic_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
+                potential_deriv = libint_interface.potential_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
+                self.overlap_derivatives.append(overlap_deriv)
+                self.kinetic_derivatives.append(kinetic_deriv)
+                self.potential_derivatives.append(potential_deriv)
 
         self.mode = mode
         self.nbf = nbf
@@ -93,13 +106,12 @@ def potential_impl(self, geom):
     def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
         if self.mode == 'core':
-            S = libint_interface.overlap_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(S).reshape(self.nbf,self.nbf)
+            S = self.overlap_derivatives[deriv_order-1][idx,:,:]
+            return jnp.asarray(S)
         else:
-            idx = get_deriv_vec_idx(deriv_vec)
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "overlap_deriv" + str(deriv_order)
@@ -121,13 +133,12 @@ def overlap_deriv_impl(self, geom, deriv_vec):
     def kinetic_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
         if self.mode == 'core':
-            T = libint_interface.kinetic_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(T).reshape(self.nbf,self.nbf)
+            T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
+            return jnp.asarray(T)
         else:
-            idx = get_deriv_vec_idx(deriv_vec)
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "kinetic_deriv" + str(deriv_order)
@@ -149,13 +160,12 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
     def potential_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        #TODO update once core algo in libint is computed, this just computes one slice at a time
         if self.mode == 'core':
-            V = libint_interface.potential_deriv(np.asarray(deriv_vec, int))
-            return jnp.asarray(V).reshape(self.nbf,self.nbf)
+            V = self.potential_derivatives[deriv_order-1][idx,:,:]
+            return jnp.asarray(V)
         else:
-            idx = get_deriv_vec_idx(deriv_vec)
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "potential_deriv" + str(deriv_order)
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index e738d3b..0d1e866 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -20,14 +20,14 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         nbf = basis_set.nbf()
 
         if mode == 'core' and max_deriv_order > 0:
-            # An list of ERI derivative tensors, containing only unique elements
+            # A list of ERI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
             # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf,nbf,nbf)
             # Then when JAX calls JVP, read appropriate slice
             self.eri_derivatives = []
             for i in range(max_deriv_order):
                 n_unique_derivs = how_many_derivs(natoms, i + 1)
-                eri_deriv = libint_interface.eri_deriv_core(i+1).reshape(n_unique_derivs,nbf,nbf,nbf,nbf)
+                eri_deriv = libint_interface.eri_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf,nbf,nbf)
                 self.eri_derivatives.append(eri_deriv)
 
         self.mode = mode
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 43b41f5..4015ba9 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -48,12 +48,12 @@ def loop_c(arr2):
                  a_2, b_2, c_2, delta_vir_2, pT_contribution_2 = arr2
                  delta_vir_2 = delta_vir_2 + delta_v[b_2,c_2]
                  Dd = Dd_occ - (fock_Vd[a_2] + fock_Vd[b_2] + fock_Vd[c_2])
-                 X = W[a_2, b_2, c_2]*V[a_2, b_2, c_2] + W[a_2, c_2, b_2]*V[a_2, c_2, b_2] + W[b_2, a_2, c_2]*V[b_2, a_2, c_2]  \
-                   + W[b_2, c_2, a_2]*V[b_2, c_2, a_2] + W[c_2, a_2, b_2]*V[c_2, a_2, b_2] + W[c_2, b_2, a_2]*V[c_2, b_2, a_2]
+                 X = W[a_2, b_2, c_2] * V[a_2, b_2, c_2] + W[a_2, c_2, b_2] * V[a_2, c_2, b_2] + W[b_2, a_2, c_2] * V[b_2, a_2, c_2]  \
+                   + W[b_2, c_2, a_2] * V[b_2, c_2, a_2] + W[c_2, a_2, b_2] * V[c_2, a_2, b_2] + W[c_2, b_2, a_2] * V[c_2, b_2, a_2]
                  Y = (V[a_2, b_2, c_2] + V[b_2, c_2, a_2] + V[c_2, a_2, b_2])
                  Z = (V[a_2, c_2, b_2] + V[b_2, a_2, c_2] + V[c_2, b_2, a_2])
                  E = (Y - 2 * Z) * (W[a_2, b_2, c_2] + W[b_2, c_2, a_2] + W[c_2, a_2, b_2]) \
-                   + (Z - 2 * Y) * (W[a_2, c_2, b_2] + W[b_2, a_2, c_2]+W[c_2, b_2, a_2]) + 3 * X
+                   + (Z - 2 * Y) * (W[a_2, c_2, b_2] + W[b_2, a_2, c_2] + W[c_2, b_2, a_2]) + 3 * X
                  pT_contribution_2 += E * delta_occ / (Dd * delta_vir_2)
                  c_2 += 1
                  return (a_2, b_2, c_2, delta_vir_2, pT_contribution_2)

From 3145c024a058f1bc63571ee13520a66cad10142e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 19 Sep 2023 17:46:16 -0400
Subject: [PATCH 07/91] oei_core_derivs

---
 quax/integrals/libint_interface.cc | 285 ++++++++---------------------
 quax/integrals/oei.py              |  10 +-
 2 files changed, 76 insertions(+), 219 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 7a3d931..e9f9ad7 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -530,9 +530,8 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 
                 for (int i = 0; i < natom; i++){
                     // i = shell_atom_index_list[i];
-                    if (i == desired_atom_idx) { 
-                        int offset_i = i + 2;
-                        int tmp = 3 * offset_i + desired_coordinates[j];
+                    if (i == desired_atom_idx) {
+                        int tmp = 3 * (i +2) + desired_coordinates[j];
                         indices[j].push_back(tmp);
                     }
                 }
@@ -772,29 +771,29 @@ void oei_deriv_disk(int max_deriv_order) {
 
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
         // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-        // how many shell and operator derivatives for potential integrals 
+        // how many shell and operator derivatives for potential integrals
         int nshell_derivs = how_many_derivs(2, deriv_order);
         int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
         // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
         // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
         // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
         const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
         // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-        int dimensions = 6 + ncart;
-        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(dimensions, deriv_order);
+        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Define engines and buffers
-        libint2::Engine overlap_engine(libint2::Operator::overlap,obs.max_nprim(),obs.max_l(),deriv_order);
-        const auto& overlap_buffer = overlap_engine.results(); 
-        libint2::Engine kinetic_engine(libint2::Operator::kinetic,obs.max_nprim(),obs.max_l(),deriv_order);
-        const auto& kinetic_buffer = kinetic_engine.results(); 
-        libint2::Engine potential_engine(libint2::Operator::nuclear,obs.max_nprim(),obs.max_l(),deriv_order);
+        libint2::Engine overlap_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+        const auto& overlap_buffer = overlap_engine.results();
+        libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+        const auto& kinetic_buffer = kinetic_engine.results();
+        libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
         potential_engine.set_params(libint2::make_point_charges(atoms));
-        const auto& potential_buffer = potential_engine.results(); 
+        const auto& potential_buffer = potential_engine.results();
 
         // Define HDF5 dataset names
         const H5std_string overlap_dset_name("overlap_deriv" + std::to_string(deriv_order));
@@ -820,7 +819,7 @@ void oei_deriv_disk(int max_deriv_order) {
                 auto bf2 = shell2bf[s2];  // first basis function in second shell
                 auto atom2 = shell2atom[s2]; // Atom index of shell 2
                 auto n2 = obs[s2].size(); // number of basis functions in second shell
-                std::vector<long> shell_atom_index_list{atom1,atom2};
+                std::vector<long> shell_atom_index_list{atom1, atom2};
 
                 overlap_engine.compute(obs[s1], obs[s2]);
                 kinetic_engine.compute(obs[s1], obs[s2]);
@@ -830,7 +829,7 @@ void oei_deriv_disk(int max_deriv_order) {
                 double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
                 double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
                 double potential_shellset_slab [n1][n2][nderivs_triu] = {};
-                
+
                 // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                 // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
                 for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
@@ -841,9 +840,9 @@ void oei_deriv_disk(int max_deriv_order) {
                     // What follows fills these indices
                     std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
-                
+
                     // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                    // and check to see if it is present in the shell duet, and where it is present in the potential operator 
+                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
                     for (int j = 0; j < multi_cart_idx.size(); j++){
                         int desired_atom_idx = multi_cart_idx[j] / 3;
                         int desired_coord = multi_cart_idx[j] % 3;
@@ -860,16 +859,16 @@ void oei_deriv_disk(int max_deriv_order) {
                         // differentiates wrt that atom, we also need to collect that index.
                         for (int i = 0; i < natom; i++){
                             if (i == desired_atom_idx) {
-                                int offset_i = i + 2;
-                                int tmp = 3 * offset_i + desired_coord;
+                                int tmp = 3 * (i + 2) + desired_coord;
                                 potential_indices[j].push_back(tmp);
                             }
                         }
                     }
+
                     // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
                     // and the total number of subvectors is the order of differentiation
                     // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
+                    // This is achievable through a cartesian product
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                     std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
                     std::vector<int> buffer_indices;
@@ -1098,14 +1097,20 @@ delete file;
 std::cout << " done" << std::endl;
 } // eri_deriv_disk function
 
-// Computes a single 'deriv_order' derivative tensor of overlap integrals, keeps everything in core memory
-py::array overlap_deriv_core(int deriv_order) {
+// Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
+std::vector<py::array> oei_deriv_core(int deriv_order) {
+    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
+    // how many shell and operator derivatives for potential integrals
     int nshell_derivs = how_many_derivs(2, deriv_order);
+    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
     // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -1113,35 +1118,46 @@ py::array overlap_deriv_core(int deriv_order) {
     // Define engines and buffers
     libint2::Engine overlap_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
     const auto& overlap_buffer = overlap_engine.results();
+    libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+    const auto& kinetic_buffer = kinetic_engine.results();
+    libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    potential_engine.set_params(libint2::make_point_charges(atoms));
+    const auto& potential_buffer = potential_engine.results();
 
     size_t length = nbf * nbf * nderivs_triu;
-    std::vector<double> result(length);
+    std::vector<double> S(length);
+    std::vector<double> T(length);
+    std::vector<double> V(length);
 
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+        auto bf1 = shell2bf[s1];  // first basis function in first shell
         auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
+        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+            auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+            auto n2 = obs[s2].size(); // number of basis functions in second shell
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
             overlap_engine.compute(obs[s1], obs[s2]);
+            kinetic_engine.compute(obs[s1], obs[s2]);
+            potential_engine.compute(obs[s1], obs[s2]);
 
             // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
             // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
             for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
                 size_t offset_nuc_idx = nuc_idx * nbf * nbf;
+
                 // Look up multidimensional cartesian derivative index
                 auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // Create a vector of vectors called `indices`, where each subvector is your possible choices
-                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
                 // What follows fills these indices
                 std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
 
                 // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet
+                // and check to see if it is present in the shell duet, and where it is present in the potential operator
                 for (int j = 0; j < multi_cart_idx.size(); j++){
                     int desired_atom_idx = multi_cart_idx[j] / 3;
                     int desired_coord = multi_cart_idx[j] % 3;
@@ -1151,6 +1167,15 @@ py::array overlap_deriv_core(int deriv_order) {
                         if (atom_idx == desired_atom_idx) {
                             int tmp = 3 * i + desired_coord;
                             indices[j].push_back(tmp);
+                            potential_indices[j].push_back(tmp);
+                        }
+                    }
+                    // Now for potentials only, loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index.
+                    for (int i = 0; i < natom; i++){
+                        if (i == desired_atom_idx) {
+                            int tmp = 3 * (i + 2) + desired_coord;
+                            potential_indices[j].push_back(tmp);
                         }
                     }
                 }
@@ -1160,8 +1185,10 @@ py::array overlap_deriv_core(int deriv_order) {
                 // Now we want all combinations where we pick exactly one index from each subvector.
                 // This is achievable through a cartesian product
                 std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
                 std::vector<int> buffer_indices;
-                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                std::vector<int> potential_buffer_indices;
+                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
                 for (auto vec : index_combos)  {
                     std::sort(vec.begin(), vec.end());
                     int buf_idx = 0;
@@ -1169,207 +1196,41 @@ py::array overlap_deriv_core(int deriv_order) {
                     if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
                     buffer_indices.push_back(buf_idx);
                 }
-
-                // Loop over shell block for each buffer index which contributes to this derivative
-                // Overlap and Kinetic
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            result[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
-                        }
-                    }
-                }
-            } // Unique nuclear cartesian derivative indices loop
-        }
-    } // shell duet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // overlap_deriv_core function
-
-// Computes a single 'deriv_order' derivative tensor of kinetic integrals, keeps everything in core memory
-py::array kinetic_deriv_core(int deriv_order) {
-    int nshell_derivs = how_many_derivs(2, deriv_order);
-    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Define engines and buffers
-    libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& kinetic_buffer = kinetic_engine.results();
-
-    size_t length = nbf * nbf * nderivs_triu;
-    std::vector<double> result(length);
-
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
-            std::vector<long> shell_atom_index_list{atom1, atom2};
-
-            kinetic_engine.compute(obs[s1], obs[s2]);
-
-            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf * nbf;
-                // Look up multidimensional cartesian derivative index
-                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // Create a vector of vectors called `indices`, where each subvector is your possible choices
-                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                // What follows fills these indices
-                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-
-                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet
-                for (int j = 0; j < multi_cart_idx.size(); j++){
-                    int desired_atom_idx = multi_cart_idx[j] / 3;
-                    int desired_coord = multi_cart_idx[j] % 3;
-                    // Loop over shell indices
-                    for (int i = 0; i < 2; i++){
-                        int atom_idx = shell_atom_index_list[i];
-                        if (atom_idx == desired_atom_idx) {
-                            int tmp = 3 * i + desired_coord;
-                            indices[j].push_back(tmp);
-                        }
-                    }
-                }
-
-                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                // and the total number of subvectors is the order of differentiation
-                // Now we want all combinations where we pick exactly one index from each subvector.
-                // This is achievable through a cartesian product
-                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                std::vector<int> buffer_indices;
-                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : index_combos)  {
+                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : potential_index_combos)  {
                     std::sort(vec.begin(), vec.end());
                     int buf_idx = 0;
-                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                    buffer_indices.push_back(buf_idx);
+                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                    potential_buffer_indices.push_back(buf_idx);
                 }
 
                 // Loop over shell block for each buffer index which contributes to this derivative
                 // Overlap and Kinetic
                 for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
                     auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            result[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
-                        }
-                    }
-                }
-            } // Unique nuclear cartesian derivative indices loop
-        }
-    } // shell duet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // kinetic_deriv_core function
-
-// Computes a single 'deriv_order' derivative tensor of potential integrals, keeps everything in core memory
-py::array potential_deriv_core(int deriv_order) {
-    int nshell_derivs = how_many_derivs(2, deriv_order, natom);
-    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-    int dimensions = 6 + ncart;
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(dimensions, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Define engines and buffers
-    libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
-    potential_engine.set_params(libint2::make_point_charges(atoms));
-    const auto& potential_buffer = potential_engine.results();
-
-    size_t length = nbf * nbf * nderivs_triu;
-    std::vector<double> result(length);
-
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
-            std::vector<long> shell_atom_index_list{atom1, atom2};
-
-            potential_engine.compute(obs[s1], obs[s2]);
-
-            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf * nbf;
-                // Look up multidimensional cartesian derivative index
-                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // Create a vector of vectors called `indices`, where each subvector is your possible choices
-                // for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                // What follows fills these indices
-                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-
-                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                for (int j = 0; j < multi_cart_idx.size(); j++){
-                    int desired_atom_idx = multi_cart_idx[j] / 3;
-                    int desired_coord = multi_cart_idx[j] % 3;
-                    // Loop over shell indices
-                    for (int i=0; i < 2; i++){
-                        int atom_idx = shell_atom_index_list[i];
-                        if (atom_idx == desired_atom_idx) {
-                            int tmp = 3 * i + desired_coord;
-                            indices[j].push_back(tmp);
-                        }
-                    }
-                    // Loop over each atom in molecule, and if this derivative
-                    // differentiates wrt that atom, we also need to collect that index.
-                    for (int i = 0; i < natom; i++){
-                        if (i == desired_atom_idx) {
-                            int offset_i = i + 2;
-                            int tmp = 3 * offset_i + desired_coord;
-                            indices[j].push_back(tmp);
+                            S[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
                         }
                     }
                 }
-
-                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                // and the total number of subvectors is the order of differentiation
-                // Now we want all combinations where we pick exactly one index from each subvector.
-                // This is achievable through a cartesian product
-                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                std::vector<int> buffer_indices;
-                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                    buffer_indices.push_back(buf_idx);
-                }
-
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto potential_shellset = potential_buffer[buffer_indices[i]];
+                // Potential
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            result[(bf1 + f1) * nbf + bf2 + f2  + offset_nuc_idx] += potential_shellset[idx];
+                            V[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
                         }
                     }
                 }
             } // Unique nuclear cartesian derivative indices loop
         }
     } // shell duet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // potential_deriv_core function
+    return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // oei_deriv_core function
 
 // Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
 py::array eri_deriv_core(int deriv_order) {
@@ -1502,9 +1363,7 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("eri_deriv", &eri_deriv, "Computes electron repulsion integral nuclear derivatives with libint");
     m.def("oei_deriv_disk", &oei_deriv_disk, "Computes overlap, kinetic, and potential integral derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("eri_deriv_disk", &eri_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("overlap_deriv_core", &overlap_deriv_core, "Computes a single overlap integral derivative tensor, in memory.");
-    m.def("kinetic_deriv_core", &kinetic_deriv_core, "Computes a single kinetic integral derivative tensor, in memory.");
-    m.def("potential_deriv_core", &potential_deriv_core, "Computes a single potential integral nuclear derivative tensor, in memory.");
+    m.def("oei_deriv_core", &oei_deriv_core, "Computes a single OEI integral derivative tensor, in memory.");
     m.def("eri_deriv_core", &eri_deriv_core, "Computes a single coulomb integral nuclear derivative tensor, in memory.");
     //TODO partial derivative impl's
     //m.def("eri_partial_deriv_disk", &eri_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 5ca372e..746179f 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -29,12 +29,10 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
             self.potential_derivatives = []
             for i in range(max_deriv_order):
                 n_unique_derivs = how_many_derivs(natoms, i + 1)
-                overlap_deriv = libint_interface.overlap_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
-                kinetic_deriv = libint_interface.kinetic_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
-                potential_deriv = libint_interface.potential_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf)
-                self.overlap_derivatives.append(overlap_deriv)
-                self.kinetic_derivatives.append(kinetic_deriv)
-                self.potential_derivatives.append(potential_deriv)
+                oei_deriv = libint_interface.oei_deriv_core(i + 1)
+                self.overlap_derivatives.append(oei_deriv[0].reshape(n_unique_derivs,nbf,nbf))
+                self.kinetic_derivatives.append(oei_deriv[1].reshape(n_unique_derivs,nbf,nbf))
+                self.potential_derivatives.append(oei_deriv[2].reshape(n_unique_derivs,nbf,nbf))
 
         self.mode = mode
         self.nbf = nbf

From 4312df8ab1da286f4b93ee0b84689a2cdb2bc10d Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 20 Sep 2023 14:54:00 -0400
Subject: [PATCH 08/91] OpenMP for Libint Ints

---
 quax/integrals/buffer_lookups.h    |   8 +-
 quax/integrals/libint_interface.cc | 380 ++++++++++++++++++++---------
 quax/integrals/makefile            |   6 +-
 3 files changed, 268 insertions(+), 126 deletions(-)

diff --git a/quax/integrals/buffer_lookups.h b/quax/integrals/buffer_lookups.h
index d9b6f96..529d326 100644
--- a/quax/integrals/buffer_lookups.h
+++ b/quax/integrals/buffer_lookups.h
@@ -1,10 +1,10 @@
-// These functions, generate_*_lookup, create the buffer index lookup arrays. 
+// These functions, generate_*_lookup, create the buffer index lookup arrays.
 // When given a set of indices which represent a Shell derivative operator, e.g. 0,0 == d/dx1 d/dx1, 0,1 = d/dx1 d/dx2, etc
 // these arrays, when indexed with those indices, give the flattened buffer index according to the order these shell derivatives
-// are packed into a Libint integral Engine buffer.  
-// These arrays are always the same for finding the shell derivative mapping for overlap, kinetic, and ERI for a given derivative order. 
+// are packed into a Libint integral Engine buffer.
+// These arrays are always the same for finding the shell derivative mapping for overlap, kinetic, and ERI for a given derivative order.
 // These are also used for nuclear derivatives of nuclear attraction integrals,
-// which vary in size dynamically due to the presence of additional nuclear derivatives 
+// which vary in size dynamically due to the presence of additional nuclear derivatives
 
 std::vector<int> generate_1d_lookup(int dim_size) { 
     std::vector<int> lookup(dim_size, 0);
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index e9f9ad7..68517a4 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -1,15 +1,19 @@
+#include <stdlib.h>
+#include <iostream>
+
+#ifdef _OPENMP
 #include <omp.h>
+#endif
+
+#include <H5Cpp.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include <stdlib.h>
 #include <libint2.hpp>
-#include <H5Cpp.h>
-#include <iostream>
 
 #include "buffer_lookups.h"
 
-// TODO support spherical harmonic gaussians, try parallelization with openmp, implement symmetry considerations, support 5th, 6th derivs
+// TODO support spherical harmonic gaussians, implement symmetry considerations, support 5th, 6th derivs
 
 namespace py = pybind11;
 using namespace H5;
@@ -21,6 +25,7 @@ unsigned int natom;
 unsigned int ncart;
 std::vector<size_t> shell2bf;
 std::vector<long> shell2atom;
+int nthreads;
 
 // These lookup arrays are for mapping Libint's computed shell-set integrals and integral derivatives to the proper index 
 // in the full OEI/TEI array or derivative array.
@@ -62,6 +67,12 @@ void initialize(std::string xyzfilename, std::string basis_name) {
     ncart = natom * 3;
     shell2bf = obs.shell2bf(); // maps shell index to basis function index
     shell2atom = obs.shell2atom(atoms); // maps shell index to atom index
+    // Get number of OMP threads
+    nthreads = 1;
+#ifdef _OPENMP
+    nthreads = omp_get_max_threads();
+#endif
+    py::print("Number of OMP Threads:", nthreads);
 }
 
 void finalize() {
@@ -159,22 +170,34 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
 // Compute overlap integrals
 py::array overlap() {
     // Overlap integral engine
-    libint2::Engine s_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l());
-    const auto& buf_vec = s_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> s_engines(nthreads);
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l());
+    for (size_t i = 1; i != nthreads; ++i) {
+        s_engines[i] = s_engines[0];
+    }
+
     size_t length = nbf * nbf;
     std::vector<double> result(length); // vector to store integral array
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];  // first basis function in first shell
-        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];  // first basis function in first shell
+            auto n1 = obs[s1].size(); // number of basis functions in first shell
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
-            s_engine.compute(obs[s1], obs[s2]); // Compute shell set
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
+            
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
+
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
@@ -189,26 +212,38 @@ py::array overlap() {
 // Compute kinetic energy integrals
 py::array kinetic() {
     // Kinetic energy integral engine
-    libint2::Engine t_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l());
-    const auto& buf_vec = t_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> t_engines(nthreads);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l());
+    for (size_t i = 1; i != nthreads; ++i) {
+        t_engines[i] = t_engines[0];
+    }
+
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];  // first basis function in first shell
-        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];  // first basis function in first shell
+            auto n1 = obs[s1].size(); // number of basis functions in first shell
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
-            t_engine.compute(obs[s1], obs[s2]); // Compute shell set
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
+
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
+
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[ (bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf + bf2 + f2] = ints_shellset[idx];
                 }
             }
         }
@@ -219,24 +254,35 @@ py::array kinetic() {
 // Compute nuclear-electron potential energy integrals
 py::array potential() {
     // Potential integral engine
-    libint2::Engine v_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l());
-    v_engine.set_params(make_point_charges(atoms));
-    const auto& buf_vec = v_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> v_engines(nthreads);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l());
+    v_engines[0].set_params(make_point_charges(atoms));
+    for (size_t i = 1; i != nthreads; ++i) {
+        v_engines[i] = v_engines[0];
+    }
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];  // first basis function in first shell
-        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];  // first basis function in first shell
+            auto n1 = obs[s1].size(); // number of basis functions in first shell
             auto bf2 = shell2bf[s2];  // first basis function in second shell
             auto n2 = obs[s2].size(); // number of basis functions in second shell
 
-            v_engine.compute(obs[s1], obs[s2]); // Compute shell set
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
+
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
+
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
@@ -252,29 +298,40 @@ py::array potential() {
 // Computes electron repulsion integrals
 py::array eri() {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l());
-    const auto& buf_vec = eri_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> eri_engines(nthreads);
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l());
+    for (size_t i = 1; i != nthreads; ++i) {
+        eri_engines[i] = eri_engines[0];
+    }
 
     size_t length = nbf * nbf * nbf * nbf;
     std::vector<double> result(length);
     
+#pragma omp parallel for collapse(4) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];  // first basis function in first shell
-        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];  // first basis function in second shell
-            auto n2 = obs[s2].size(); // number of basis functions in second shell
             for(auto s3=0; s3 != obs.size(); ++s3) {
-                auto bf3 = shell2bf[s3];  // first basis function in third shell
-                auto n3 = obs[s3].size(); // number of basis functions in third shell
                 for(auto s4 = 0; s4 != obs.size(); ++s4) {
+                    auto bf1 = shell2bf[s1];  // first basis function in first shell
+                    auto n1 = obs[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf[s2];  // first basis function in second shell
+                    auto n2 = obs[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf[s3];  // first basis function in third shell
+                    auto n3 = obs[s3].size(); // number of basis functions in third shell
                     auto bf4 = shell2bf[s4];  // first basis function in fourth shell
                     auto n4 = obs[s4].size(); // number of basis functions in fourth shell
 
-                    eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
+
                     auto ints_shellset = buf_vec[0];    // Location of the computed integrals
                     if (ints_shellset == nullptr)
                         continue;  // nullptr returned if the entire shell-set was screened out
+
                     // Loop over shell block, keeping a total count idx for the size of shell set
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
@@ -307,19 +364,22 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Overlap integral derivative engine
-    libint2::Engine s_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+    std::vector<libint2::Engine> s_engines(nthreads);
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+    for (size_t i = 1; i != nthreads; ++i) {
+        s_engines[i] = s_engines[0];
+    }
 
     // Get size of overlap derivative array and allocate 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
-    const auto& buf_vec = s_engine.results(); // will point to computed shell sets
-
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom[s1]; // Atom index of shell 1
+            auto n1 = obs[s1].size();    // number of basis functions in shell 1
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -342,7 +402,12 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             if (desired_shell_atoms.size() != deriv_order) continue;
 
             // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
-            s_engine.compute(obs[s1], obs[s2]); 
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
             std::vector<int> shell_derivative;
@@ -393,17 +458,21 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Kinetic integral derivative engine
-    libint2::Engine t_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& buf_vec = t_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> t_engines(nthreads);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+    for (size_t i = 1; i != nthreads; ++i) {
+        t_engines[i] = t_engines[0];
+    }
 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom[s1]; // Atom index of shell 1
+            auto n1 = obs[s1].size();    // number of basis functions in shell 1
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -426,7 +495,12 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             if (desired_shell_atoms.size() != deriv_order) continue;
 
             // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
-            t_engine.compute(obs[s1], obs[s2]); 
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
             std::vector<int> shell_derivative;
@@ -485,19 +559,23 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
     process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
     // Potential integral derivative engine
-    libint2::Engine v_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
-    v_engine.set_params(libint2::make_point_charges(atoms));
-    const auto& buf_vec = v_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> v_engines(nthreads);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    v_engines[0].set_params(make_point_charges(atoms));
+    for (size_t i = 1; i != nthreads; ++i) {
+        v_engines[i] = v_engines[0];
+    }
 
     // Get size of potential derivative array and allocate 
     size_t length = nbf * nbf;
     std::vector<double> result(length);
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
+            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom[s1]; // Atom index of shell 1
+            auto n1 = obs[s1].size();    // number of basis functions in shell 1
             auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
             auto n2 = obs[s2].size();    // number of basis functions in shell 2
@@ -541,7 +619,12 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
             std::vector<std::vector<int>> index_combos = cartesian_product(indices);
 
             // Compute the integrals
-            v_engine.compute(obs[s1], obs[s2]); 
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
             
             // Loop over every subvector of index_combos and lookup buffer index.
             std::vector<int> buffer_indices;
@@ -603,24 +686,29 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
     // ERI derivative integral engine
-    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& buf_vec = eri_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> eri_engines(nthreads);
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+    for (size_t i = 1; i != nthreads; ++i) {
+        eri_engines[i] = eri_engines[0];
+    }
+
     size_t length = nbf * nbf * nbf * nbf;
     std::vector<double> result(length);
 
+#pragma omp parallel for collapse(4) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
             for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                auto n3 = obs[s3].size();    // number of basis functions in shell 3
                 for(auto s4 = 0; s4 != obs.size(); ++s4) {
+                    auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom[s1]; // Atom index of shell 1
+                    auto n1 = obs[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom[s2]; // Atom index of shell 2
+                    auto n2 = obs[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom[s3]; // Atom index of shell 3
+                    auto n3 = obs[s3].size();    // number of basis functions in shell 3
                     auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                     auto atom4 = shell2atom[s4]; // Atom index of shell 4
                     auto n4 = obs[s4].size();    // number of basis functions in shell 4
@@ -705,7 +793,12 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set, fills buf_vec
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     for(auto i = 0; i<buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -787,13 +880,16 @@ void oei_deriv_disk(int max_deriv_order) {
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Define engines and buffers
-        libint2::Engine overlap_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
-        const auto& overlap_buffer = overlap_engine.results();
-        libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-        const auto& kinetic_buffer = kinetic_engine.results();
-        libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
-        potential_engine.set_params(libint2::make_point_charges(atoms));
-        const auto& potential_buffer = potential_engine.results();
+        std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+        s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+        v_engines[0].set_params(make_point_charges(atoms));
+        for (size_t i = 1; i != nthreads; ++i) {
+            s_engines[i] = s_engines[0];
+            t_engines[i] = t_engines[0];
+            v_engines[i] = v_engines[0];
+        }
 
         // Define HDF5 dataset names
         const H5std_string overlap_dset_name("overlap_deriv" + std::to_string(deriv_order));
@@ -821,9 +917,16 @@ void oei_deriv_disk(int max_deriv_order) {
                 auto n2 = obs[s2].size(); // number of basis functions in second shell
                 std::vector<long> shell_atom_index_list{atom1, atom2};
 
-                overlap_engine.compute(obs[s1], obs[s2]);
-                kinetic_engine.compute(obs[s1], obs[s2]);
-                potential_engine.compute(obs[s1], obs[s2]);
+                size_t thread_id = 0;
+#ifdef _OPENMP
+                thread_id = omp_get_thread_num();
+#endif
+                s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+                t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+                v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
 
                 // Define shell set slabs
                 double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
@@ -972,14 +1075,18 @@ void eri_deriv_disk(int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        // Currently not used due to predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Libint engine for computing shell quartet derivatives
-        libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
-        const auto& eri_buffer = eri_engine.results(); // will point to computed shell sets
+        std::vector<libint2::Engine> eri_engines(nthreads);
+        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+        for (size_t i = 1; i != nthreads; ++i) {
+            eri_engines[i] = eri_engines[0];
+        }
 
         // Define HDF5 dataset name
         const H5std_string eri_dset_name("eri_deriv" + std::to_string(deriv_order));
@@ -991,20 +1098,20 @@ void eri_deriv_disk(int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
-        // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != obs.size(); ++s1) {
-            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size();    // number of basis functions in shell 1
             for(auto s2 = 0; s2 != obs.size(); ++s2) {
-                auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-                auto atom2 = shell2atom[s2]; // Atom index of shell 2
-                auto n2 = obs[s2].size();    // number of basis functions in shell 2
                 for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                    auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                    auto n3 = obs[s3].size();    // number of basis functions in shell 3
                     for(auto s4 = 0; s4 != obs.size(); ++s4) {
+                        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom[s1]; // Atom index of shell 1
+                        auto n1 = obs[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom[s2]; // Atom index of shell 2
+                        auto n2 = obs[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom[s3]; // Atom index of shell 3
+                        auto n3 = obs[s3].size();    // number of basis functions in shell 3
                         auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                         auto atom4 = shell2atom[s4]; // Atom index of shell 4
                         auto n4 = obs[s4].size();    // number of basis functions in shell 4
@@ -1012,7 +1119,12 @@ void eri_deriv_disk(int max_deriv_order) {
                         if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                         std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-                        eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
                         // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
                         double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
@@ -1108,7 +1220,8 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
     // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
     // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
     // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
     const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
@@ -1116,32 +1229,43 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Define engines and buffers
-    libint2::Engine overlap_engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& overlap_buffer = overlap_engine.results();
-    libint2::Engine kinetic_engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& kinetic_buffer = kinetic_engine.results();
-    libint2::Engine potential_engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
-    potential_engine.set_params(libint2::make_point_charges(atoms));
-    const auto& potential_buffer = potential_engine.results();
+    std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    v_engines[0].set_params(make_point_charges(atoms));
+    for (size_t i = 1; i != nthreads; ++i) {
+        s_engines[i] = s_engines[0];
+        t_engines[i] = t_engines[0];
+        v_engines[i] = v_engines[0];
+    }
 
     size_t length = nbf * nbf * nderivs_triu;
     std::vector<double> S(length);
     std::vector<double> T(length);
     std::vector<double> V(length);
 
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];  // first basis function in first shell
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size(); // number of basis functions in first shell
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];  // first basis function in second shell
+            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom[s1]; // Atom index of shell 1
+            auto n1 = obs[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
             auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size(); // number of basis functions in second shell
+            auto n2 = obs[s2].size();    // number of basis functions in shell 2
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            overlap_engine.compute(obs[s1], obs[s2]);
-            kinetic_engine.compute(obs[s1], obs[s2]);
-            potential_engine.compute(obs[s1], obs[s2]);
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
 
             // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
             // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
@@ -1189,12 +1313,18 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                 std::vector<int> buffer_indices;
                 std::vector<int> potential_buffer_indices;
                 // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                //for (auto vec : index_combos)  {
+                //    std::sort(vec.begin(), vec.end());
+                //    int buf_idx = 0;
+                //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                //    buffer_indices.push_back(buf_idx);
+                //}
                 for (auto vec : index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                    buffer_indices.push_back(buf_idx);
+                    if (deriv_order == 1) buffer_indices.push_back(buffer_index_oei1d[vec[0]]);
+                    else if (deriv_order == 2) buffer_indices.push_back(buffer_index_oei2d[vec[0]][vec[1]]);
+                    else if (deriv_order == 3) buffer_indices.push_back(buffer_index_oei3d[vec[0]][vec[1]][vec[2]]);
+                    else if (deriv_order == 4) buffer_indices.push_back(buffer_index_oei4d[vec[0]][vec[1]][vec[2]][vec[3]]);
                 }
                 // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
                 for (auto vec : potential_index_combos)  {
@@ -1240,32 +1370,37 @@ py::array eri_deriv_core(int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
-    libint2::Engine eri_engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
-    const auto& eri_buffer = eri_engine.results(); // will point to computed shell sets
+    std::vector<libint2::Engine> eri_engines(nthreads);
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+    for (size_t i = 1; i != nthreads; ++i) {
+        eri_engines[i] = eri_engines[0];
+    }
 
     size_t length = nbf * nbf * nbf * nbf * nderivs_triu;
     std::vector<double> result(length);
 
     // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
     for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-        auto n1 = obs[s1].size();    // number of basis functions in shell 1
         for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
             for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                auto n3 = obs[s3].size();    // number of basis functions in shell 3
                 for(auto s4 = 0; s4 != obs.size(); ++s4) {
+                    auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom[s1]; // Atom index of shell 1
+                    auto n1 = obs[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom[s2]; // Atom index of shell 2
+                    auto n2 = obs[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom[s3]; // Atom index of shell 3
+                    auto n3 = obs[s3].size();    // number of basis functions in shell 3
                     auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
                     auto atom4 = shell2atom[s4]; // Atom index of shell 4
                     auto n4 = obs[s4].size();    // number of basis functions in shell 4
@@ -1273,7 +1408,12 @@ py::array eri_deriv_core(int deriv_order) {
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-                    eri_engine.compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                     for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index 93dad13..eb7acde 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -1,6 +1,8 @@
 # NOTE: These paths below need to be edited such that they point to a set of 
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
+# Options passed to compiler
+CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
 LIBINT_PREFIX := /home/ecm23353/psi_env
 
@@ -33,7 +35,7 @@ clean:
 	rm -f $(OBJ)
 
 $(OBJ): %.o : %.cc $(DEPS)
-	$(CC) -c $< -o $@ -O3 -fPIC -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+	$(CC) -c $< -o $@ $(CFLAGS) -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 $(TARGETS): $(OBJ)
-	$(CC) $^ -o $@ -O3 -fPIC -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+	$(CC) $^ -o $@ $(CFLAGS) -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 

From 05a1e0ebf6bdcdb5d3cbfbd35216f2fa02f69507 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 22 Sep 2023 14:48:36 -0400
Subject: [PATCH 09/91] Start to generalize to use multiple basis sets

---
 quax/integrals/libint_interface.cc | 443 ++++++++++++++++-------------
 quax/integrals/makefile            |   2 +-
 quax/methods/ints.py               |   6 +-
 quax/utils.py                      |   2 +-
 4 files changed, 249 insertions(+), 204 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 68517a4..1aa2d49 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -19,12 +19,12 @@ namespace py = pybind11;
 using namespace H5;
 
 std::vector<libint2::Atom> atoms;
-libint2::BasisSet obs;
-unsigned int nbf;
 unsigned int natom;
 unsigned int ncart;
-std::vector<size_t> shell2bf;
-std::vector<long> shell2atom;
+libint2::BasisSet bs1, bs2, bs3, bs4;
+unsigned int nbf1, nbf2, nbf3, nbf4;
+std::vector<size_t> shell2bf_1, shell2bf_2, shell2bf_3, shell2bf_4;
+std::vector<long> shell2atom_1, shell2atom_2, shell2atom_3, shell2atom_4;
 int nthreads;
 
 // These lookup arrays are for mapping Libint's computed shell-set integrals and integral derivatives to the proper index 
@@ -55,18 +55,36 @@ std::vector<libint2::Atom> get_atoms(std::string xyzfilename)
 }
 
 // Must call initialize before computing ints 
-void initialize(std::string xyzfilename, std::string basis_name) {
+void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
+                std::string basis3, std::string basis4) {
     libint2::initialize();
     atoms = get_atoms(xyzfilename);
-    // Move harddrive load of basis and xyz to happen only once
-    obs = libint2::BasisSet(basis_name, atoms);
-    obs.set_pure(false); // use cartesian gaussians
-    // Get size of potential derivative array and allocate 
-    nbf = obs.nbf();
     natom = atoms.size();
     ncart = natom * 3;
-    shell2bf = obs.shell2bf(); // maps shell index to basis function index
-    shell2atom = obs.shell2atom(atoms); // maps shell index to atom index
+
+    // Move harddrive load of basis and xyz to happen only once
+    bs1 = libint2::BasisSet(basis1, atoms);
+    bs1.set_pure(false); // use cartesian gaussians
+    bs2 = libint2::BasisSet(basis2, atoms);
+    bs2.set_pure(false); // use cartesian gaussians
+    bs3 = libint2::BasisSet(basis3, atoms);
+    bs3.set_pure(false); // use cartesian gaussians
+    bs4 = libint2::BasisSet(basis4, atoms);
+    bs4.set_pure(false); // use cartesian gaussians
+
+    nbf1 = bs1.nbf();
+    nbf2 = bs2.nbf();
+    nbf3 = bs3.nbf();
+    nbf4 = bs4.nbf();
+    shell2bf_1 = bs1.shell2bf();
+    shell2bf_2 = bs2.shell2bf();
+    shell2bf_3 = bs3.shell2bf();
+    shell2bf_4 = bs4.shell2bf();
+    shell2atom_1 = bs1.shell2atom(atoms);
+    shell2atom_2 = bs2.shell2atom(atoms);
+    shell2atom_3 = bs3.shell2atom(atoms);
+    shell2atom_4 = bs4.shell2atom(atoms);
+
     // Get number of OMP threads
     nthreads = 1;
 #ifdef _OPENMP
@@ -171,29 +189,31 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
 py::array overlap() {
     // Overlap integral engine
     std::vector<libint2::Engine> s_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l());
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
     }
 
-    size_t length = nbf * nbf;
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length); // vector to store integral array
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];  // first basis function in first shell
-            auto n1 = obs[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf[s2];  // first basis function in second shell
-            auto n2 = obs[s2].size(); // number of basis functions in second shell
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+            auto n1 = bs1[s1].size(); // number of basis functions in first shell
+            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+            auto n2 = bs2[s2].size(); // number of basis functions in second shell
 
             size_t thread_id = 0;
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
-            
+
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
@@ -201,7 +221,7 @@ py::array overlap() {
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf + bf2 + f2] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
                 }
             }
         }
@@ -213,27 +233,29 @@ py::array overlap() {
 py::array kinetic() {
     // Kinetic energy integral engine
     std::vector<libint2::Engine> t_engines(nthreads);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l());
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         t_engines[i] = t_engines[0];
     }
 
-    size_t length = nbf * nbf;
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];  // first basis function in first shell
-            auto n1 = obs[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf[s2];  // first basis function in second shell
-            auto n2 = obs[s2].size(); // number of basis functions in second shell
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+            auto n1 = bs1[s1].size(); // number of basis functions in first shell
+            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+            auto n2 = bs2[s2].size(); // number of basis functions in second shell
 
             size_t thread_id = 0;
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
 
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
@@ -243,7 +265,7 @@ py::array kinetic() {
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf + bf2 + f2] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
                 }
             }
         }
@@ -255,28 +277,30 @@ py::array kinetic() {
 py::array potential() {
     // Potential integral engine
     std::vector<libint2::Engine> v_engines(nthreads);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l());
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l);
     v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
         v_engines[i] = v_engines[0];
     }
 
-    size_t length = nbf * nbf;
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];  // first basis function in first shell
-            auto n1 = obs[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf[s2];  // first basis function in second shell
-            auto n2 = obs[s2].size(); // number of basis functions in second shell
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+            auto n1 = bs1[s1].size(); // number of basis functions in first shell
+            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+            auto n2 = bs2[s2].size(); // number of basis functions in second shell
 
             size_t thread_id = 0;
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
 
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
@@ -287,7 +311,7 @@ py::array potential() {
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     // idx = x + (y * width) where x = bf2 + f2 and y = bf1 + f1 
-                    result[ (bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
                 }
             }
         }
@@ -299,33 +323,35 @@ py::array potential() {
 py::array eri() {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l());
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
     }
 
-    size_t length = nbf * nbf * nbf * nbf;
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
     std::vector<double> result(length);
     
 #pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            for(auto s3=0; s3 != obs.size(); ++s3) {
-                for(auto s4 = 0; s4 != obs.size(); ++s4) {
-                    auto bf1 = shell2bf[s1];  // first basis function in first shell
-                    auto n1 = obs[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf[s2];  // first basis function in second shell
-                    auto n2 = obs[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf[s3];  // first basis function in third shell
-                    auto n3 = obs[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf[s4];  // first basis function in fourth shell
-                    auto n4 = obs[s4].size(); // number of basis functions in fourth shell
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3=0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
+                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
+                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
+                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
 
                     size_t thread_id = 0;
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     auto ints_shellset = buf_vec[0];    // Location of the computed integrals
@@ -334,11 +360,11 @@ py::array eri() {
 
                     // Loop over shell block, keeping a total count idx for the size of shell set
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                         for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf * nbf;
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
                             for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf;
+                                size_t offset_3 = (bf3 + f3) * nbf4;
                                 for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                     result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
                                 }
@@ -365,24 +391,26 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 
     // Overlap integral derivative engine
     std::vector<libint2::Engine> s_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
     }
 
     // Get size of overlap derivative array and allocate 
-    size_t length = nbf * nbf;
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
             // If the atoms are the same we ignore it as the derivatives will be zero.
             if (atom1 == atom2) continue;
 
@@ -406,7 +434,7 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
@@ -438,7 +466,7 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf2 + bf2 + f2 ] = ints_shellset[idx];
                 }
             }
         }
@@ -459,23 +487,25 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 
     // Kinetic integral derivative engine
     std::vector<libint2::Engine> t_engines(nthreads);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         t_engines[i] = t_engines[0];
     }
 
-    size_t length = nbf * nbf;
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
             // If the atoms are the same we ignore it as the derivatives will be zero.
             if (atom1 == atom2) continue;
 
@@ -499,7 +529,7 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
 
             // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
@@ -531,7 +561,7 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             // Loop over shell block, keeping a total count idx for the size of shell set
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf + bf2 + f2 ] = ints_shellset[idx];
+                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
                 }
             }
         }
@@ -560,25 +590,27 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
 
     // Potential integral derivative engine
     std::vector<libint2::Engine> v_engines(nthreads);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
     v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
         v_engines[i] = v_engines[0];
     }
 
-    // Get size of potential derivative array and allocate 
-    size_t length = nbf * nbf;
+    // Get size of potential derivative array and allocate
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
 
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
@@ -623,7 +655,7 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
             
             // Loop over every subvector of index_combos and lookup buffer index.
@@ -665,7 +697,7 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                        result[(bf1 + f1) * nbf + bf2 + f2] += ints_shellset[idx];
+                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
                     }
                 }
             }
@@ -687,31 +719,33 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 
     // ERI derivative integral engine
     std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
     }
 
-    size_t length = nbf * nbf * nbf * nbf;
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
     std::vector<double> result(length);
 
 #pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                for(auto s4 = 0; s4 != obs.size(); ++s4) {
-                    auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom[s1]; // Atom index of shell 1
-                    auto n1 = obs[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom[s2]; // Atom index of shell 2
-                    auto n2 = obs[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                    auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom[s4]; // Atom index of shell 4
-                    auto n4 = obs[s4].size();    // number of basis functions in shell 4
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
                     // If the atoms are the same we ignore it as the derivatives will be zero.
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
@@ -797,18 +831,18 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     for(auto i = 0; i<buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                             for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf * nbf;
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
                                 for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf;
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
                                     for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                         result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
                                     }
@@ -853,7 +887,10 @@ void oei_deriv_disk(int max_deriv_order) {
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
         total_deriv_slices += how_many_derivs(natom, i);
-        }
+    }
+
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
 
     // Create H5 File and prepare to fill with 0.0's
     const H5std_string file_name("oei_derivs.h5");
@@ -881,9 +918,9 @@ void oei_deriv_disk(int max_deriv_order) {
 
         // Define engines and buffers
         std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-        s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
-        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+        s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
         v_engines[0].set_params(make_point_charges(atoms));
         for (size_t i = 1; i != nthreads; ++i) {
             s_engines[i] = s_engines[0];
@@ -897,7 +934,7 @@ void oei_deriv_disk(int max_deriv_order) {
         const H5std_string potential_dset_name("potential_deriv" + std::to_string(deriv_order));
 
         // Define rank and dimensions of data that will be written to the file
-        hsize_t file_dims[] = {nbf, nbf, nderivs_triu};
+        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
         DataSpace fspace(3, file_dims);
         // Create dataset for each integral type and write 0.0's into the file 
         DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
@@ -907,23 +944,24 @@ void oei_deriv_disk(int max_deriv_order) {
         hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[3] = {0, 0, 0};
 
-        for(auto s1 = 0; s1 != obs.size(); ++s1) {
-            auto bf1 = shell2bf[s1];  // first basis function in first shell
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size(); // number of basis functions in first shell
-            for(auto s2 = 0; s2 != obs.size(); ++s2) {
-                auto bf2 = shell2bf[s2];  // first basis function in second shell
-                auto atom2 = shell2atom[s2]; // Atom index of shell 2
-                auto n2 = obs[s2].size(); // number of basis functions in second shell
+#pragma omp parallel for collapse(2) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
                 std::vector<long> shell_atom_index_list{atom1, atom2};
 
                 size_t thread_id = 0;
 #ifdef _OPENMP
                 thread_id = omp_get_thread_num();
 #endif
-                s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
-                t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
-                v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
                 const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
                 const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
                 const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
@@ -1060,12 +1098,15 @@ void eri_deriv_disk(int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
         total_deriv_slices += how_many_derivs(natom, i);
-        }
-    double check = (nbf * nbf * nbf * nbf * total_deriv_slices * 8) * (1e-9);
+    }
+    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
     assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
 
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
@@ -1083,14 +1124,14 @@ void eri_deriv_disk(int max_deriv_order) {
 
         // Libint engine for computing shell quartet derivatives
         std::vector<libint2::Engine> eri_engines(nthreads);
-        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
         for (size_t i = 1; i != nthreads; ++i) {
             eri_engines[i] = eri_engines[0];
         }
 
         // Define HDF5 dataset name
         const H5std_string eri_dset_name("eri_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf, nbf, nbf, nbf, nderivs_triu};
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
         DataSpace fspace(5, file_dims);
         // Create dataset for each integral type and write 0.0's into the file 
         DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
@@ -1099,22 +1140,22 @@ void eri_deriv_disk(int max_deriv_order) {
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
 #pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != obs.size(); ++s1) {
-            for(auto s2 = 0; s2 != obs.size(); ++s2) {
-                for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                    for(auto s4 = 0; s4 != obs.size(); ++s4) {
-                        auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom[s1]; // Atom index of shell 1
-                        auto n1 = obs[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom[s2]; // Atom index of shell 2
-                        auto n2 = obs[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                        auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom[s4]; // Atom index of shell 4
-                        auto n4 = obs[s4].size();    // number of basis functions in shell 4
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
                         if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                         std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
@@ -1123,7 +1164,7 @@ void eri_deriv_disk(int max_deriv_order) {
 #ifdef _OPENMP
                         thread_id = omp_get_thread_num();
 #endif
-                        eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                         const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
                         // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
@@ -1230,9 +1271,11 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
     // Define engines and buffers
     std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, obs.max_nprim(), obs.max_l(), deriv_order);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, obs.max_nprim(), obs.max_l(), deriv_order);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
     v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
@@ -1240,29 +1283,29 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
         v_engines[i] = v_engines[0];
     }
 
-    size_t length = nbf * nbf * nderivs_triu;
+    size_t length = nbf1 * nbf2 * nderivs_triu;
     std::vector<double> S(length);
     std::vector<double> T(length);
     std::vector<double> V(length);
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom[s1]; // Atom index of shell 1
-            auto n1 = obs[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom[s2]; // Atom index of shell 2
-            auto n2 = obs[s2].size();    // number of basis functions in shell 2
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
             size_t thread_id = 0;
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
-            t_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
-            v_engines[thread_id].compute(obs[s1], obs[s2]); // Compute shell set
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
             const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
             const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
@@ -1270,7 +1313,7 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
             // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
             // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
             for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf * nbf;
+                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
 
                 // Look up multidimensional cartesian derivative index
                 auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
@@ -1342,8 +1385,8 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                     auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            S[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
-                            T[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
+                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
                         }
                     }
                 }
@@ -1352,7 +1395,7 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                     auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            V[(bf1 + f1) * nbf + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
+                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
                         }
                     }
                 }
@@ -1378,32 +1421,34 @@ py::array eri_deriv_core(int deriv_order) {
 
     // Libint engine for computing shell quartet derivatives
     std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, obs.max_nprim(), obs.max_l(), deriv_order);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
     }
 
-    size_t length = nbf * nbf * nbf * nbf * nderivs_triu;
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
     std::vector<double> result(length);
 
     // Begin shell quartet loops
 #pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != obs.size(); ++s1) {
-        for(auto s2 = 0; s2 != obs.size(); ++s2) {
-            for(auto s3 = 0; s3 != obs.size(); ++s3) {
-                for(auto s4 = 0; s4 != obs.size(); ++s4) {
-                    auto bf1 = shell2bf[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom[s1]; // Atom index of shell 1
-                    auto n1 = obs[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom[s2]; // Atom index of shell 2
-                    auto n2 = obs[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom[s3]; // Atom index of shell 3
-                    auto n3 = obs[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom[s4]; // Atom index of shell 4
-                    auto n4 = obs[s4].size();    // number of basis functions in shell 4
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
@@ -1412,12 +1457,12 @@ py::array eri_deriv_core(int deriv_order) {
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(obs[s1], obs[s2], obs[s3], obs[s4]); // Compute shell set
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                     for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf * nbf * nbf * nbf;
+                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
 
                         // Look up multidimensional cartesian derivative index
                         auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
@@ -1465,11 +1510,11 @@ py::array eri_deriv_core(int deriv_order) {
                             auto eri_shellset = eri_buffer[buffer_indices[i]];
                             if (eri_shellset == nullptr) continue;
                             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf * nbf * nbf;
+                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                                 for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf * nbf;
+                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
                                     for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf;
+                                        size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                             size_t offset_4 = bf4 + f4;
                                             result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += eri_shellset[idx];
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index eb7acde..26123f1 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -2,7 +2,7 @@
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
 # Options passed to compiler
-CFLAGS  := -O3 -fPIC -fopenmp
+CFLAGS  := -O3 -fPIC -fopenmp -g
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
 LIBINT_PREFIX := /home/ecm23353/psi_env
 
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 95e681e..aed6bb6 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -27,14 +27,14 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
         oei_obj = OEI(basis_name, xyz_path, deriv_order, 'disk')
         # If disk integral derivs are right, nothing to do
         if check:
-            libint_interface.initialize(xyz_path, basis_name)
+            libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
             G = tei_obj.tei(geom)
             libint_interface.finalize()
         else:
-            libint_interface.initialize(xyz_path, basis_name)
+            libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
             libint_interface.oei_deriv_disk(deriv_order)
             libint_interface.eri_deriv_disk(deriv_order)
             S = oei_obj.overlap(geom)
@@ -44,7 +44,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             libint_interface.finalize()
 
     else:
-        libint_interface.initialize(xyz_path, basis_name)
+        libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
         # Precompute TEI derivatives
         tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
         oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
diff --git a/quax/utils.py b/quax/utils.py
index 07a5574..7dcbf7a 100644
--- a/quax/utils.py
+++ b/quax/utils.py
@@ -7,7 +7,7 @@ def how_many_derivs(k,n):
     fact = 1
     for i in range(n):
         val *= 3 * k + i
-        fact *= i + 1;
+        fact *= i + 1
     val /= fact
     return int(val)
 

From 16145584dd5df629cfe5335906dbf7c498ab8064 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 25 Sep 2023 12:40:44 -0400
Subject: [PATCH 10/91] Change tei to eri

---
 quax/integrals/tei.py | 42 +++++++++++++++++++++---------------------
 quax/methods/ints.py  |  6 +++---
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 0d1e866..5c6b98f 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -34,35 +34,35 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         self.nbf = nbf
 
         # Create new JAX primitive for TEI evaluation
-        self.tei_p = jax.core.Primitive("tei")
-        self.tei_deriv_p = jax.core.Primitive("tei_deriv")
+        self.eri_p = jax.core.Primitive("eri")
+        self.eri_deriv_p = jax.core.Primitive("eri_deriv")
 
         # Register primitive evaluation rules
-        self.tei_p.def_impl(self.tei_impl)
-        self.tei_deriv_p.def_impl(self.tei_deriv_impl)
+        self.eri_p.def_impl(self.eri_impl)
+        self.eri_deriv_p.def_impl(self.eri_deriv_impl)
 
         # Register the JVP rules with JAX
-        jax.interpreters.ad.primitive_jvps[self.tei_p] = self.tei_jvp
-        jax.interpreters.ad.primitive_jvps[self.tei_deriv_p] = self.tei_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.eri_p] = self.eri_jvp
+        jax.interpreters.ad.primitive_jvps[self.eri_deriv_p] = self.eri_deriv_jvp
 
         # Register tei_deriv batching rule with JAX
-        jax.interpreters.batching.primitive_batchers[self.tei_deriv_p] = self.tei_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.eri_deriv_p] = self.eri_deriv_batch
 
     # Create functions to call primitives
-    def tei(self, geom):
-        return self.tei_p.bind(geom)
+    def eri(self, geom):
+        return self.eri_p.bind(geom)
 
-    def tei_deriv(self, geom, deriv_vec):
-        return self.tei_deriv_p.bind(geom, deriv_vec)
+    def eri_deriv(self, geom, deriv_vec):
+        return self.eri_deriv_p.bind(geom, deriv_vec)
 
     # Create primitive evaluation rules
-    def tei_impl(self, geom):
+    def eri_impl(self, geom):
         G = libint_interface.eri()
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
         G = G.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
         return jnp.asarray(G)
 
-    def tei_deriv_impl(self, geom, deriv_vec):
+    def eri_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
@@ -99,22 +99,22 @@ def tei_deriv_impl(self, geom, deriv_vec):
     # Create Jacobian-vector product rule, which given some input args (primals)
     # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
     # and the slice of the Jacobian (tangents_out)
-    def tei_jvp(self, primals, tangents):
+    def eri_jvp(self, primals, tangents):
         geom, = primals
-        primals_out = self.tei(geom)
-        tangents_out = self.tei_deriv(geom, tangents[0])
+        primals_out = self.eri(geom)
+        tangents_out = self.eri_deriv(geom, tangents[0])
         return primals_out, tangents_out
 
-    def tei_deriv_jvp(self, primals, tangents):
+    def eri_deriv_jvp(self, primals, tangents):
         geom, deriv_vec = primals
-        primals_out = self.tei_deriv(geom, deriv_vec)
+        primals_out = self.eri_deriv(geom, deriv_vec)
         # Here we add the current value of deriv_vec to the incoming tangent vector,
         # so that nested higher order differentiation works
-        tangents_out = self.tei_deriv(geom, deriv_vec + tangents[0])
+        tangents_out = self.eri_deriv(geom, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP of tei
-    def tei_deriv_batch(self, batched_args, batch_dims):
+    def eri_deriv_batch(self, batched_args, batch_dims):
         # When the input argument of deriv_batch is batched along the 0'th axis
         # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
         # (expand dims at 0 and concatenate at 0)
@@ -124,7 +124,7 @@ def tei_deriv_batch(self, batched_args, batch_dims):
         geom_dim, deriv_dim = batch_dims
         results = []
         for i in deriv_batch:
-            tmp = self.tei_deriv(geom_batch, i)
+            tmp = self.eri_deriv(geom_batch, i)
             results.append(jnp.expand_dims(tmp, axis=0))
         results = jnp.concatenate(results, axis=0)
         return results, 0
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index aed6bb6..6255d79 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -31,7 +31,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
-            G = tei_obj.tei(geom)
+            G = tei_obj.eri(geom)
             libint_interface.finalize()
         else:
             libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
@@ -40,7 +40,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
-            G = tei_obj.tei(geom)
+            G = tei_obj.eri(geom)
             libint_interface.finalize()
 
     else:
@@ -52,7 +52,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
         S = oei_obj.overlap(geom)
         T = oei_obj.kinetic(geom)
         V = oei_obj.potential(geom)
-        G = tei_obj.tei(geom)
+        G = tei_obj.eri(geom)
         libint_interface.finalize()
 
     return S, T, V, G

From 8eacd28d92f56070c3f964e1a24080adf87fb37f Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 26 Sep 2023 16:30:42 -0400
Subject: [PATCH 11/91] F12 Derivatives and basis set generalization

---
 quax/integrals/libint_interface.cc | 3062 +++++++++++++++++++++++-----
 quax/integrals/oei.py              |    6 +-
 quax/integrals/tei.py              |  239 ++-
 quax/methods/ints.py               |   14 +-
 4 files changed, 2833 insertions(+), 488 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 1aa2d49..c7fb89a 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -97,6 +97,40 @@ void finalize() {
     libint2::finalize();
 }
 
+// Used to make contracted Gaussian-type geminal for F12 methods
+std::vector<std::pair<double, double>> make_cgtg(double exponent) {
+    // The fitting coefficients and the exponents from MPQC
+    std::vector<std::pair<double, double>> exp_coeff = {};
+    std::vector<double> coeffs = {-0.31442480597241274, -0.30369575353387201, -0.16806968430232927,
+                                  -0.098115812152857612, -0.060246640234342785, -0.037263541968504843};
+    std::vector<double> exps = {0.22085085450735284, 1.0040191632019282, 3.6212173098378728,
+                                12.162483236221904, 45.855332448029337, 254.23460688554644};
+
+    for (int i = 0; i < exps.size(); i++){
+        auto exp_scaled = (exponent * exponent) * exps[i];
+        exp_coeff.push_back(std::make_pair(exp_scaled, coeffs[i]));
+    }
+    
+    return exp_coeff;
+}
+
+// Returns square of cgtg
+std::vector<std::pair<double, double>> take_square(std::vector<std::pair<double, double>> input) {
+    auto n = input.size();
+    std::vector<std::pair<double, double>> output;
+    for (int i = 0; i < n; ++i) {
+        auto e_i = input[i].first;
+        auto c_i = input[i].second;
+        for (int j = i; j < n; ++j) {
+            auto e_j = input[j].first;
+            auto c_j = input[j].second;
+            double scale = i == j ? 1.0 : 2.0;
+            output.emplace_back(std::make_pair(e_i + e_j, scale * c_i * c_j));
+        }
+    }
+    return output;
+}
+
 // Cartesian product of arbitrary number of vectors, given a vector of vectors
 // Used to find all possible combinations of indices which correspond to desired nuclear derivatives
 // For example, if molecule has two atoms, A and B, and we want nuclear derivative d^2/dAz dBz, represented by deriv_vec = [0,0,1,0,0,1], 
@@ -378,6 +412,250 @@ py::array eri() {
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
 }
 
+// Computes four-center integrals of the contracted Gaussian-type geminal (libint2 Operator::cgtg); beta is forwarded to make_cgtg. Returns a flat array of length nbf1*nbf2*nbf3*nbf4.
+py::array f12(double beta) {
+    // TODO: workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
+    auto cgtg_params = make_cgtg(beta); // (exponent, coefficient) pairs defining the geminal
+    std::vector<libint2::Engine> cgtg_engines(nthreads); // one engine per thread, indexed by thread id below
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+    cgtg_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) { // copy the configured engine into every other thread's slot
+        cgtg_engines[i] = cgtg_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length); // value-initialized to 0.0, so skipped shell quartets stay zero
+    
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3=0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
+                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
+                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
+                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+
+                    size_t thread_id = 0; // stays 0 when built without OpenMP
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_engines[thread_id].results(); // will point to computed shell sets
+
+                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+                    if (ints_shellset == nullptr)
+                        continue;  // nullptr returned if the entire shell-set was screened out
+
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx]; // row-major (bf1, bf2, bf3, bf4) position in the flat result
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? See https://github.com/pybind/pybind11/issues/1042 for a workaround
+}
+
+// Computes four-center integrals of the squared contracted Gaussian-type geminal: same libint2 Operator::cgtg engine as f12, but parameterized with take_square(make_cgtg(beta)). Returns a flat array of length nbf1*nbf2*nbf3*nbf4.
+py::array f12_squared(double beta) {
+    // TODO: workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
+    auto cgtg_params = take_square(make_cgtg(beta)); // (exponent, coefficient) pairs of the squared geminal
+    std::vector<libint2::Engine> cgtg_squared_engines(nthreads); // one engine per thread, indexed by thread id below
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+    cgtg_squared_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) { // copy the configured engine into every other thread's slot
+        cgtg_squared_engines[i] = cgtg_squared_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length); // value-initialized to 0.0, so skipped shell quartets stay zero
+    
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3=0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
+                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
+                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
+                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+
+                    size_t thread_id = 0; // stays 0 when built without OpenMP
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
+
+                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+                    if (ints_shellset == nullptr)
+                        continue;  // nullptr returned if the entire shell-set was screened out
+
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx]; // row-major (bf1, bf2, bf3, bf4) position in the flat result
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? See https://github.com/pybind/pybind11/issues/1042 for a workaround
+}
+
+// Computes four-center integrals of the contracted Gaussian-type geminal times the Coulomb operator (libint2 Operator::cgtg_x_coulomb); beta is forwarded to make_cgtg. Returns a flat array of length nbf1*nbf2*nbf3*nbf4.
+py::array f12g12(double beta) {
+    // TODO: workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
+    auto cgtg_params = make_cgtg(beta); // (exponent, coefficient) pairs defining the geminal
+    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads); // one engine per thread, indexed by thread id below
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
+    cgtg_coulomb_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) { // copy the configured engine into every other thread's slot
+        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length); // value-initialized to 0.0, so skipped shell quartets stay zero
+    
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3=0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
+                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
+                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
+                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+
+                    size_t thread_id = 0; // stays 0 when built without OpenMP
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
+
+                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+                    if (ints_shellset == nullptr)
+                        continue;  // nullptr returned if the entire shell-set was screened out
+
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx]; // row-major (bf1, bf2, bf3, bf4) position in the flat result
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? See https://github.com/pybind/pybind11/issues/1042 for a workaround
+}
+
+// Computes four-center integrals for libint2 Operator::delcgtg2 (presumably the squared gradient norm of the contracted Gaussian-type geminal -- confirm against libint docs); beta is forwarded to make_cgtg. Returns a flat array of length nbf1*nbf2*nbf3*nbf4.
+py::array f12_double_commutator(double beta) {
+    // TODO: workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
+    auto cgtg_params = make_cgtg(beta); // (exponent, coefficient) pairs defining the geminal
+    std::vector<libint2::Engine> cgtg_del_engines(nthreads); // one engine per thread, indexed by thread id below
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    // NOTE: params and BraKet are passed to the constructor explicitly. The shorthand version returns Runtime Error: bad any_cast; may be an error on the Libint side since Psi4 works with this as well
+    cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0, 0., cgtg_params, libint2::BraKet::xx_xx);
+    for (size_t i = 1; i != nthreads; ++i) { // copy the configured engine into every other thread's slot
+        cgtg_del_engines[i] = cgtg_del_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length); // value-initialized to 0.0, so skipped shell quartets stay zero
+    
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3=0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
+                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
+                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
+                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
+                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
+                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
+                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
+                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+
+                    size_t thread_id = 0; // stays 0 when built without OpenMP
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
+
+                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+                    if (ints_shellset == nullptr)
+                        continue;  // nullptr returned if the entire shell-set was screened out
+
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx]; // row-major (bf1, bf2, bf3, bf4) position in the flat result
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? See https://github.com/pybind/pybind11/issues/1042 for a workaround
+}
+
 // Computes nuclear derivatives of overlap integrals
 py::array overlap_deriv(std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
@@ -858,555 +1136,2375 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
 }
 
-// The following function writes all overlap, kinetic, and potential derivatives up to `max_deriv_order` to disk
-// HDF5 File Name: oei_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      overlap_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      overlap_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      overlap_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-//      ...
-//      kinetic_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      kinetic_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      kinetic_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-//      ...
-//      potential_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      potential_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      potential_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-// The number of unique derivatives is essentially equal to the size of the generalized upper triangle of the derivative tensor.
-void oei_deriv_disk(int max_deriv_order) {
-    std::cout << "Writing one-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
+// Computes nuclear derivatives of contracted Gaussian-type geminal integrals
+py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
 
-    // Create H5 File and prepare to fill with 0.0's
-    const H5std_string file_name("oei_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-        // how many shell and operator derivatives for potential integrals
-        int nshell_derivs = how_many_derivs(2, deriv_order);
-        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
-        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-        // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Define engines and buffers
-        std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-        s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
-        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
-        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
-        v_engines[0].set_params(make_point_charges(atoms));
-        for (size_t i = 1; i != nthreads; ++i) {
-            s_engines[i] = s_engines[0];
-            t_engines[i] = t_engines[0];
-            v_engines[i] = v_engines[0];
-        }
-
-        // Define HDF5 dataset names
-        const H5std_string overlap_dset_name("overlap_deriv" + std::to_string(deriv_order));
-        const H5std_string kinetic_dset_name("kinetic_deriv" + std::to_string(deriv_order));
-        const H5std_string potential_dset_name("potential_deriv" + std::to_string(deriv_order));
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
-        // Define rank and dimensions of data that will be written to the file
-        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
-        DataSpace fspace(3, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
-        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[3] = {0, 0, 0};
+    // F12 derivative integral engine
+    auto cgtg_params = make_cgtg(beta);
+    std::vector<libint2::Engine> cgtg_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+    cgtg_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) {
+        cgtg_engines[i] = cgtg_engines[0];
+    }
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                std::vector<long> shell_atom_index_list{atom1, atom2};
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length);
 
-                size_t thread_id = 0;
-#ifdef _OPENMP
-                thread_id = omp_get_thread_num();
-#endif
-                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
-                // Define shell set slabs
-                double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
-                double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
-                double potential_shellset_slab [n1][n2][nderivs_triu] = {};
+                    // If the atoms are the same we ignore it as the derivatives will be zero.
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
+                    bool atoms_not_present = false;
+                    for (int i = 0; i < deriv_order; i++){
+                        if (atom1 == desired_atom_indices[i]) continue; 
+                        else if (atom2 == desired_atom_indices[i]) continue;
+                        else if (atom3 == desired_atom_indices[i]) continue;
+                        else if (atom4 == desired_atom_indices[i]) continue;
+                        else {atoms_not_present = true; break;}
+                    }
+                    if (atoms_not_present) continue;
 
-                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                    // Look up multidimensional cartesian derivative index
-                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                    // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                    // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                    // What follows fills these indices
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-                    // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                    for (int j = 0; j < multi_cart_idx.size(); j++){
-                        int desired_atom_idx = multi_cart_idx[j] / 3;
-                        int desired_coord = multi_cart_idx[j] % 3;
-                        // Loop over shell indices
-                        for (int i = 0; i < 2; i++){
+                    // Initialize 2d vector, with DERIV_ORDER subvectors
+                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
+                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
+                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
+                    std::vector<std::vector<int>> indices;
+                    for (int i = 0; i < deriv_order; i++){
+                        std::vector<int> new_vec;
+                        indices.push_back(new_vec);
+                    }
+                
+                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
+                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    for (int j = 0; j < desired_atom_indices.size(); j++){
+                        int desired_atom_idx = desired_atom_indices[j];
+                        // Shell indices
+                        for (int i = 0; i < 4; i++){
                             int atom_idx = shell_atom_index_list[i];
                             if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coord;
+                                int tmp = 3 * i + desired_coordinates[j];
                                 indices[j].push_back(tmp);
-                                potential_indices[j].push_back(tmp);
-                            }
-                        }
-                        // Now for potentials only, loop over each atom in molecule, and if this derivative
-                        // differentiates wrt that atom, we also need to collect that index.
-                        for (int i = 0; i < natom; i++){
-                            if (i == desired_atom_idx) {
-                                int tmp = 3 * (i + 2) + desired_coord;
-                                potential_indices[j].push_back(tmp);
                             }
                         }
                     }
-
+                    
                     // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
                     // and the total number of subvectors is the order of differentiation
                     // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product
+                    // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+
+                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    std::vector<int> potential_buffer_indices;
-                    // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
+                    if (deriv_order == 1){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
+                        }
                     }
-                    // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : potential_index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                        if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                        potential_buffer_indices.push_back(buf_idx);
+                    else if (deriv_order == 2){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
+                        }
                     }
-
-                    // Loop over shell block for each buffer index which contributes to this derivative
-                    // Overlap and Kinetic
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
-                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
-                            }
+                    else if (deriv_order == 3){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
                         }
                     }
-                    // Potential
-                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                    else if (deriv_order == 4){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            int idx4 = index_combos[i][3];
+                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
+                        }
+                    }
+
+                    // If we made it this far, the shell derivative we want is contained in the buffer. 
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_engines[thread_id].results(); // will point to computed shell sets
+
+                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
+                                    }
+                                }
                             }
                         }
                     }
-                } // Unique nuclear cartesian derivative indices loop
-
-                // Now write this shell set slab to HDF5 file
-                // Create file space hyperslab, defining where to write data to in file
-                hsize_t count[3] = {n1, n2, nderivs_triu};
-                hsize_t start[3] = {bf1, bf2, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                // Create dataspace defining for memory dataset to write to file
-                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
-                DataSpace mspace(3, mem_dims);
-                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                }
             }
-        } // shell duet loops
-    // Delete datasets for this derivative order
-    delete overlap_dataset;
-    delete kinetic_dataset;
-    delete potential_dataset;
-    } // deriv order loop
-// close the file
-delete file;
-std::cout << " done" << std::endl;
-} //oei_deriv_disk 
+        }
+    }
+    // This is not the bottleneck
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+}
 
+// Computes nuclear derivatives of squared contracted Gaussian-type geminal integrals
+py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
-// Writes all ERI's up to `max_deriv_order` to disk.
-// HDF5 File Name: eri_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      eri_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      eri_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      eri_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void eri_deriv_disk(int max_deriv_order) { 
-    std::cout << "Writing two-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("eri_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // F12 Squared derivative integral engine
+    auto cgtg_params = take_square(make_cgtg(beta));
+    std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
     size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
     int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
+    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+    cgtg_squared_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) {
+        cgtg_squared_engines[i] = cgtg_squared_engines[0];
     }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length);
 
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> eri_engines(nthreads);
-        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
-        for (size_t i = 1; i != nthreads; ++i) {
-            eri_engines[i] = eri_engines[0];
-        }
+                    // If all four shells are centered on the same atom, skip this quartet since its derivatives will be zero.
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    // Every desired atom must match the atom of at least one shell for the desired derivative to exist; otherwise, skip this shell quartet.
+                    bool atoms_not_present = false;
+                    for (int i = 0; i < deriv_order; i++){
+                        if (atom1 == desired_atom_indices[i]) continue; 
+                        else if (atom2 == desired_atom_indices[i]) continue;
+                        else if (atom3 == desired_atom_indices[i]) continue;
+                        else if (atom4 == desired_atom_indices[i]) continue;
+                        else {atoms_not_present = true; break;}
+                    }
+                    if (atoms_not_present) continue;
 
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("eri_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+                    // Initialize 2d vector, with DERIV_ORDER subvectors
+                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
+                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
+                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
+                    std::vector<std::vector<int>> indices;
+                    for (int i = 0; i < deriv_order; i++){
+                        std::vector<int> new_vec;
+                        indices.push_back(new_vec);
+                    }
+                
+                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
+                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    for (int j = 0; j < desired_atom_indices.size(); j++){
+                        int desired_atom_idx = desired_atom_indices[j];
+                        // Shell indices
+                        for (int i = 0; i < 4; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coordinates[j];
+                                indices[j].push_back(tmp);
+                            }
+                        }
+                    }
+                    
+                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product 
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
 
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+                    // Now create buffer_indices from these index combos using lookup array
+                    std::vector<int> buffer_indices;
+                    if (deriv_order == 1){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
+                        }
+                    }
+                    else if (deriv_order == 2){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
+                        }
+                    }
+                    else if (deriv_order == 3){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
+                        }
+                    }
+                    else if (deriv_order == 4){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            int idx4 = index_combos[i][3];
+                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
+                        }
+                    }
 
-                        size_t thread_id = 0;
+                    // If we made it this far, the shell derivative we want is contained in the buffer. 
+                    size_t thread_id = 0;
 #ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
+                    thread_id = omp_get_thread_num();
 #endif
-                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
 
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
+                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
                                     }
                                 }
                             }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // This is not the bottleneck
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+}
 
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-                            
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
-                            for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
-                            }
+// Computes nuclear derivatives of contracted Gaussian-type geminal times Coulomb repulsion integrals
+py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto eri_shellset = eri_buffer[buffer_indices[i]];
-                                if (eri_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+
+    // F12 derivative integral engine
+    auto cgtg_params = make_cgtg(beta);
+    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+    cgtg_coulomb_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) {
+        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    std::vector<double> result(length);
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                    // If all four shells are centered on the same atom, skip this quartet since its derivatives will be zero.
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    // Every desired atom must match the atom of at least one shell for the desired derivative to exist; otherwise, skip this shell quartet.
+                    bool atoms_not_present = false;
+                    for (int i = 0; i < deriv_order; i++){
+                        if (atom1 == desired_atom_indices[i]) continue; 
+                        else if (atom2 == desired_atom_indices[i]) continue;
+                        else if (atom3 == desired_atom_indices[i]) continue;
+                        else if (atom4 == desired_atom_indices[i]) continue;
+                        else {atoms_not_present = true; break;}
+                    }
+                    if (atoms_not_present) continue;
+
+                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                    // Initialize 2d vector, with DERIV_ORDER subvectors
+                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
+                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
+                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
+                    std::vector<std::vector<int>> indices;
+                    for (int i = 0; i < deriv_order; i++){
+                        std::vector<int> new_vec;
+                        indices.push_back(new_vec);
+                    }
+                
+                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
+                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    for (int j = 0; j < desired_atom_indices.size(); j++){
+                        int desired_atom_idx = desired_atom_indices[j];
+                        // Shell indices
+                        for (int i = 0; i < 4; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coordinates[j];
+                                indices[j].push_back(tmp);
+                            }
+                        }
+                    }
+                    
+                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product 
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+
+                    // Now create buffer_indices from these index combos using lookup array
+                    std::vector<int> buffer_indices;
+                    if (deriv_order == 1){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
+                        }
+                    }
+                    else if (deriv_order == 2){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
+                        }
+                    }
+                    else if (deriv_order == 3){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
+                        }
+                    }
+                    else if (deriv_order == 4){ 
+                        for (int i = 0; i < index_combos.size(); i++){
+                            int idx1 = index_combos[i][0];
+                            int idx2 = index_combos[i][1];
+                            int idx3 = index_combos[i][2];
+                            int idx4 = index_combos[i][3];
+                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
+                        }
+                    }
+
+                    // If we made it this far, the shell derivative we want is contained in the buffer. 
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
+
+                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // This is not the bottleneck
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+}
+
+// Nuclear derivatives of the gradient-norm ("double commutator") integrals over a contracted
+// Gaussian-type geminal, evaluated with Libint's delcgtg2 operator.
+//
+//   beta      - value handed to make_cgtg() to build the geminal contraction parameters
+//   deriv_vec - one entry per nuclear cartesian coordinate (its length is asserted to equal ncart);
+//               the entries sum to the total derivative order and are decoded by process_deriv_vec()
+//
+// Returns a flat array of nbf1 * nbf2 * nbf3 * nbf4 values in row-major (bf1, bf2, bf3, bf4) order.
+// Only derivative orders 1-4 have a buffer lookup table here; for any other order no buffer index
+// is selected and nothing is accumulated into the output.
+py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
+    // Total order of differentiation is the sum over all entries of deriv_vec
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+
+    // Decode deriv_vec into, per derivative operator, the atom index and the cartesian component
+    std::vector<int> target_atoms;
+    std::vector<int> target_coords;
+    process_deriv_vec(deriv_vec, &target_atoms, &target_coords);
+
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+
+    // F12 derivative integral engines: one per thread, all copies of the first
+    auto cgtg_params = make_cgtg(beta);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    std::vector<libint2::Engine> engines(nthreads);
+    // The long-form constructor is deliberate: the shorthand version raised "bad any_cast" at runtime
+    // (possibly a Libint-side issue, since Psi4 also uses the long form)
+    engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
+    for (size_t t = 1; t != nthreads; ++t) {
+        engines[t] = engines[0];
+    }
+
+    // Zero-initialized output tensor, flattened
+    std::vector<double> deriv_ints(nbf1 * nbf2 * nbf3 * nbf4);
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    // Index of the first basis function in each shell
+                    auto bf1 = shell2bf_1[s1];
+                    auto bf2 = shell2bf_2[s2];
+                    auto bf3 = shell2bf_3[s3];
+                    auto bf4 = shell2bf_4[s4];
+                    // Atom index of each shell
+                    auto atom1 = shell2atom_1[s1];
+                    auto atom2 = shell2atom_2[s2];
+                    auto atom3 = shell2atom_3[s3];
+                    auto atom4 = shell2atom_4[s4];
+                    // Number of basis functions in each shell
+                    auto n1 = bs1[s1].size();
+                    auto n2 = bs2[s2].size();
+                    auto n3 = bs3[s3].size();
+                    auto n4 = bs4[s4].size();
+
+                    // All four shells on the same atom: ignored, as the derivatives will be zero
+                    const bool single_center = (atom1 == atom2) && (atom1 == atom3) && (atom1 == atom4);
+                    if (single_center) continue;
+
+                    // Every differentiated atom must carry at least one of the four shells,
+                    // otherwise the desired derivative does not exist for this quartet
+                    bool every_target_present = true;
+                    for (int d = 0; d < deriv_order && every_target_present; ++d) {
+                        const int wanted = target_atoms[d];
+                        every_target_present = (atom1 == wanted) || (atom2 == wanted)
+                                            || (atom3 == wanted) || (atom4 == wanted);
+                    }
+                    if (!every_target_present) continue;
+
+                    // Atom index of each shell, in shell order. Libint uses longs, so we do too.
+                    std::vector<long> shell_atoms{atom1, atom2, atom3, atom4};
+
+                    // choices[k] lists the shell-derivative indices (3 * shell + xyz) that can realize
+                    // the k-th derivative operator. One empty list per operator to start with.
+                    std::vector<std::vector<int>> choices;
+                    if (deriv_order > 0) choices.resize(deriv_order);
+
+                    // A shell is a candidate for operator k when it sits on the atom operator k differentiates
+                    for (int k = 0; k < target_atoms.size(); k++) {
+                        const int wanted_atom = target_atoms[k];
+                        for (int shell = 0; shell < 4; shell++) {
+                            const int shell_atom = shell_atoms[shell];
+                            if (shell_atom != wanted_atom) continue;
+                            choices[k].push_back(3 * shell + target_coords[k]);
+                        }
+                    }
+
+                    // Picking exactly one candidate per operator in every possible way (cartesian product)
+                    // gives all shell-derivative terms that sum to the requested nuclear derivative
+                    std::vector<std::vector<int>> combos = cartesian_product(choices);
+
+                    // Translate each combination into a Libint buffer index through the lookup arrays
+                    std::vector<int> buffer_indices;
+                    for (int c = 0; c < combos.size(); c++) {
+                        const std::vector<int>& combo = combos[c];
+                        switch (deriv_order) {
+                            case 1:
+                                buffer_indices.push_back(buffer_index_eri1d[combo[0]]);
+                                break;
+                            case 2:
+                                buffer_indices.push_back(buffer_index_eri2d[combo[0]][combo[1]]);
+                                break;
+                            case 3:
+                                buffer_indices.push_back(buffer_index_eri3d[combo[0]][combo[1]][combo[2]]);
+                                break;
+                            case 4:
+                                buffer_indices.push_back(buffer_index_eri4d[combo[0]][combo[1]][combo[2]][combo[3]]);
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+
+                    // If we made it this far, the shell derivative we want is contained in the buffer.
+                    size_t tid = 0;
+#ifdef _OPENMP
+                    tid = omp_get_thread_num();
+#endif
+                    engines[tid].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& shellsets = engines[tid].results(); // will point to computed shell sets
+
+                    // Add every contributing shell set into this quartet's block of the output
+                    for (auto b = 0; b < buffer_indices.size(); ++b) {
+                        auto shellset = shellsets[buffer_indices[b]];
+                        if (shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        auto cursor = shellset;  // walks the shell set in (f1, f2, f3, f4) order
+                        for (size_t f1 = 0; f1 != n1; ++f1) {
+                            const size_t off1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for (size_t f2 = 0; f2 != n2; ++f2) {
+                                const size_t off2 = (bf2 + f2) * nbf3 * nbf4;
+                                for (size_t f3 = 0; f3 != n3; ++f3) {
+                                    const size_t off3 = (bf3 + f3) * nbf4;
+                                    for (size_t f4 = 0; f4 != n4; ++f4) {
+                                        deriv_ints[off1 + off2 + off3 + bf4 + f4] += *cursor;
+                                        ++cursor;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // This is not the bottleneck
+    return py::array(deriv_ints.size(), deriv_ints.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+}
+
+// Writes all overlap, kinetic, and potential integral nuclear derivatives, for every order from 1
+// up to `max_deriv_order`, to disk.
+// HDF5 File Name: oei_derivs.h5 (created, or truncated if it already exists)
+//      HDF5 Dataset names within the file, one set per derivative order N = 1, 2, 3, ...:
+//      overlap_derivN
+//          shape (nbf1,nbf2,n_unique_Nth_derivs)
+//      kinetic_derivN
+//          shape (nbf1,nbf2,n_unique_Nth_derivs)
+//      potential_derivN
+//          shape (nbf1,nbf2,n_unique_Nth_derivs)
+// The number of unique derivatives is essentially equal to the size of the generalized upper triangle of the derivative tensor.
+//
+// Threading: the integrals are computed in parallel over shell pairs, but every HDF5 call made from
+// inside the parallel region is serialized in a critical section. The file dataspace selection is
+// shared between threads, so selecting a hyperslab and writing through it must happen as one step.
+void oei_deriv_disk(int max_deriv_order) {
+    std::cout << "Writing one-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
+
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+
+    // Create H5 File and prepare to fill with 0.0's.
+    // The file and dataset handles are stack objects so they are released even if an HDF5 call throws.
+    const H5std_string file_name("oei_derivs.h5");
+    H5File file(file_name, H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+        // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
+
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Define engines: one of each kind per thread, all copies of the first
+        std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+        s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+        v_engines[0].set_params(make_point_charges(atoms));
+        for (size_t i = 1; i != nthreads; ++i) {
+            s_engines[i] = s_engines[0];
+            t_engines[i] = t_engines[0];
+            v_engines[i] = v_engines[0];
+        }
+
+        // Define HDF5 dataset names
+        const H5std_string overlap_dset_name("overlap_deriv" + std::to_string(deriv_order));
+        const H5std_string kinetic_dset_name("kinetic_deriv" + std::to_string(deriv_order));
+        const H5std_string potential_dset_name("potential_deriv" + std::to_string(deriv_order));
+
+        // Define rank and dimensions of data that will be written to the file
+        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
+        DataSpace fspace(3, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file
+        DataSet overlap_dataset = file.createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist);
+        DataSet kinetic_dataset = file.createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist);
+        DataSet potential_dataset = file.createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist);
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[3] = {0, 0, 0};
+
+#pragma omp parallel for collapse(2) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                std::vector<long> shell_atom_index_list{atom1, atom2};
+
+                size_t thread_id = 0;
+#ifdef _OPENMP
+                thread_id = omp_get_thread_num();
+#endif
+                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
+
+                // Define shell set slabs: zero-initialized, flat, row-major [n1][n2][nderivs_triu] buffers.
+                // They live on the heap because their size grows with the number of unique derivatives,
+                // and variable-length stack arrays are not standard C++.
+                const size_t slab_size = n1 * n2 * nderivs_triu;
+                std::vector<double> overlap_shellset_slab(slab_size, 0.0);
+                std::vector<double> kinetic_shellset_slab(slab_size, 0.0);
+                std::vector<double> potential_shellset_slab(slab_size, 0.0);
+
+                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                    // Look up multidimensional cartesian derivative index
+                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                    // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                    // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                    // What follows fills these indices
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                    std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+
+                    // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                    for (int j = 0; j < multi_cart_idx.size(); j++){
+                        int desired_atom_idx = multi_cart_idx[j] / 3;
+                        int desired_coord = multi_cart_idx[j] % 3;
+                        // Loop over shell indices
+                        for (int i = 0; i < 2; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coord;
+                                indices[j].push_back(tmp);
+                                potential_indices[j].push_back(tmp);
+                            }
+                        }
+                        // Now for potentials only: the operator also depends on the differentiated atom,
+                        // so collect that index too. Operator centers sit after the two shell centers, hence the +2.
+                        if (desired_atom_idx < natom) {
+                            potential_indices[j].push_back(3 * (desired_atom_idx + 2) + desired_coord);
+                        }
+                    }
+
+                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                    std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+                    std::vector<int> buffer_indices;
+                    std::vector<int> potential_buffer_indices;
+                    // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative.
+                    // Each combination is sorted, then located in the lookup table by binary search;
+                    // index 0 is used if the search runs off the end of the table.
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
+                    }
+                    // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                    for (auto vec : potential_index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                        if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                        potential_buffer_indices.push_back(buf_idx);
+                    }
+
+                    // Loop over shell block for each buffer index which contributes to this derivative
+                    // Overlap and Kinetic
+                    for (int buf_idx : buffer_indices) {
+                        auto overlap_shellset = overlap_buffer[buf_idx];
+                        auto kinetic_shellset = kinetic_buffer[buf_idx];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                // Flat position of element [f1][f2][nuc_idx]
+                                const size_t slab_pos = (f1 * n2 + f2) * nderivs_triu + nuc_idx;
+                                overlap_shellset_slab[slab_pos] += overlap_shellset[idx];
+                                kinetic_shellset_slab[slab_pos] += kinetic_shellset[idx];
+                            }
+                        }
+                    }
+                    // Potential
+                    for (int buf_idx : potential_buffer_indices) {
+                        auto potential_shellset = potential_buffer[buf_idx];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                const size_t slab_pos = (f1 * n2 + f2) * nderivs_triu + nuc_idx;
+                                potential_shellset_slab[slab_pos] += potential_shellset[idx];
+                            }
+                        }
+                    }
+                } // Unique nuclear cartesian derivative indices loop
+
+                // Now write this shell set slab to HDF5 file.
+                // One thread at a time: `fspace` is shared, so another thread must not change its
+                // selection between our selectHyperslab and our writes, and the HDF5 calls
+                // themselves are not issued concurrently.
+#pragma omp critical
+                {
+                    // Create file space hyperslab, defining where to write data to in file
+                    hsize_t count[3] = {n1, n2, nderivs_triu};
+                    hsize_t start[3] = {bf1, bf2, 0};
+                    fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                    // Create dataspace defining for memory dataset to write to file
+                    hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+                    DataSpace mspace(3, mem_dims);
+                    mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                    // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+                    overlap_dataset.write(overlap_shellset_slab.data(), PredType::NATIVE_DOUBLE, mspace, fspace);
+                    kinetic_dataset.write(kinetic_shellset_slab.data(), PredType::NATIVE_DOUBLE, mspace, fspace);
+                    potential_dataset.write(potential_shellset_slab.data(), PredType::NATIVE_DOUBLE, mspace, fspace);
+                }
+            }
+        } // shell duet loops
+        // The dataset handles for this derivative order are released here as they go out of scope
+    } // deriv order loop
+    // close the file before reporting completion
+    file.close();
+    std::cout << " done" << std::endl;
+} //oei_deriv_disk 
+
+
+// Writes the nuclear derivatives of the ERIs, for every order from 1 up to `max_deriv_order`, to disk.
+// HDF5 File Name: eri_derivs.h5 
+//      HDF5 Dataset names within the file:
+//      eri_deriv1 
+//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
+//      eri_deriv2
+//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
+//      eri_deriv3
+//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
+//      ...
+void eri_deriv_disk(int max_deriv_order) { 
+    std::cout << "Writing two-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    const H5std_string file_name("eri_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
+    // Check to make sure you are not flooding the disk.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer)
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of ERI's
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // Currently not used due to predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Libint engine for computing shell quartet derivatives
+        std::vector<libint2::Engine> eri_engines(nthreads);
+        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+        for (size_t i = 1; i != nthreads; ++i) {
+            eri_engines[i] = eri_engines[0];
+        }
+
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name("eri_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                        double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+    
+                            // Find out which 
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
+
+                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                            // and the total number of subvectors is the order of differentiation
+                            // Now we want all combinations where we pick exactly one index from each subvector.
+                            // This is achievable through a cartesian product 
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
+                            
+                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                            //for (auto vec : index_combos)  {
+                            //    std::sort(vec.begin(), vec.end());
+                            //    int buf_idx = 0;
+                            //    // buffer_multidim_lookup
+                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            //    buffer_indices.push_back(buf_idx);
+                            //}
+                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
+                            for (auto vec : index_combos)  {
+                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            }
+
+                            // Loop over shell block, keeping a total count idx for the size of shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto eri_shellset = eri_buffer[buffer_indices[i]];
+                                if (eri_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                                eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        } // For every nuc_idx 0, nderivs_triu
+                        // Now write this shell set slab to HDF5 file
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                        // Create dataspace defining for memory dataset to write to file
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        DataSpace mspace(5, mem_dims);
+                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+                        eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                    }
+                }
+            }
+        } // shell quartet loops
+    // Close the dataset for this derivative order
+    delete eri_dataset;
+    } // deriv order loop 
+// Close the file
+delete file;
+std::cout << " done" << std::endl;
+} // eri_deriv_disk function
+
+// Writes nuclear derivatives of the F12 (libint `Operator::cgtg`) two-electron integrals
+// for every order 1..`max_deriv_order` to disk.
+//      beta            : forwarded to make_cgtg() to build the operator parameters
+//      max_deriv_order : highest derivative order written
+// HDF5 File Name: f12_derivs.h5 (opened with H5F_ACC_TRUNC)
+//      HDF5 Dataset names within the file:
+//      f12_deriv1, f12_deriv2, f12_deriv3, ...
+//          f12_derivN has shape (nbf1,nbf2,nbf3,nbf4,n_unique_Nth_derivs)
+// Shell quartets whose four shells sit on the same atom are skipped, so their
+// entries keep the dataset fill value of 0.0.
+void f12_deriv_disk(double beta, int max_deriv_order) { 
+    std::cout << "Writing two-electron F12 integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    const H5std_string file_name("f12_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
+    // Check to make sure you are not flooding the disk. The estimate (8 bytes per double, in GB) is accumulated in double so the product cannot wrap.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+    double check = static_cast<double>(nbf1) * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8 * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
+    auto cgtg_params = make_cgtg(beta);
+
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer); currently unused below
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of the integrals at this order
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index; not used because of the predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Libint engines for computing shell quartet derivatives: one per thread, all copies of engine 0
+        std::vector<libint2::Engine> cgtg_engines(nthreads);
+        cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        cgtg_engines[0].set_params(cgtg_params);
+        for (size_t i = 1; i != nthreads; ++i) {
+            cgtg_engines[i] = cgtg_engines[0];
+        }
+
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name("f12_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create the dataset for this derivative order; unwritten entries read back as the 0.0 fill value
+        DataSet* f12_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& f12_buffer = cgtg_engines[thread_id].results(); // will point to computed shell sets
+
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                        double f12_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                            // For each derivative operator j, collect the shell-center coordinate indices (3 * center + xyz) of every shell sitting on the differentiated atom
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
+
+                            // Now indices is a vector of vectors, where each subvector holds the choices for the first derivative operator, second, third, etc,
+                            // and the total number of subvectors is the order of differentiation.
+                            // We want all combinations that pick exactly one index from each subvector, i.e. a cartesian product.
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
+
+                            // Alternative (unused): binary search for the 1d buffer index of each entry of `index_combos`; the predefined lookup arrays below (orders 1-4 only) are used instead
+                            //for (auto vec : index_combos)  {
+                            //    std::sort(vec.begin(), vec.end());
+                            //    int buf_idx = 0;
+                            //    // buffer_multidim_lookup
+                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            //    buffer_indices.push_back(buf_idx);
+                            //}
+                            for (auto vec : index_combos)  {
+                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            }
+
+                            // Accumulate every contributing buffer entry into this nuc_idx slice; idx runs over the flattened shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto f12_shellset = f12_buffer[buffer_indices[i]];
+                                if (f12_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                                f12_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_shellset[idx];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        } // For every nuc_idx 0, nderivs_triu
+                        // Now write this shell set slab to HDF5 file
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        // `fspace` is declared outside the parallel region, so every OpenMP thread shares it: selecting its hyperslab and writing through it
+                        // must be one serialized step, or another thread can re-select it in between. All HDF5 calls (incl. `mspace` lifetime) stay inside.
+#pragma omp critical (quax_hdf5_write)
+                        {
+                            fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                            DataSpace mspace(5, mem_dims);
+                            mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                            f12_dataset->write(f12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                        }
+                    }
+                }
+            }
+        } // shell quartet loops
+    // Close the dataset for this derivative order
+    delete f12_dataset;
+    } // deriv order loop 
+// Close the file
+delete file;
+std::cout << " done" << std::endl;
+} // f12_deriv_disk function
+
+// Writes nuclear derivatives of the F12 squared two-electron integrals (libint `Operator::cgtg`
+// with take_square() applied to the parameters) for every order 1..`max_deriv_order` to disk.
+//      beta            : forwarded to make_cgtg() to build the operator parameters
+//      max_deriv_order : highest derivative order written
+// HDF5 File Name: f12_squared_derivs.h5 (opened with H5F_ACC_TRUNC)
+//      HDF5 Dataset names within the file:
+//      f12_squared_deriv1, f12_squared_deriv2, f12_squared_deriv3, ...
+//          f12_squared_derivN has shape (nbf1,nbf2,nbf3,nbf4,n_unique_Nth_derivs)
+// Shell quartets whose four shells sit on the same atom are skipped, so their
+// entries keep the dataset fill value of 0.0.
+void f12_squared_deriv_disk(double beta, int max_deriv_order) { 
+    std::cout << "Writing two-electron F12 squared integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    const H5std_string file_name("f12_squared_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    // Engine limits; computed once here and reused for every derivative order
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
+    // Check to make sure you are not flooding the disk. The estimate (8 bytes per double, in GB) is accumulated in double so the product cannot wrap.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+    double check = static_cast<double>(nbf1) * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8 * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
+    // Operator parameters: take_square() applied to the make_cgtg(beta) result
+    auto cgtg_params = take_square(make_cgtg(beta));
+
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer); currently unused below
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of the integrals at this order
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index; not used because of the predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Libint engines for computing shell quartet derivatives: one per thread, all copies of engine 0
+        std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
+        cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        cgtg_squared_engines[0].set_params(cgtg_params);
+        for (size_t i = 1; i != nthreads; ++i) {
+            cgtg_squared_engines[i] = cgtg_squared_engines[0];
+        }
+
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name("f12_squared_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create the dataset for this derivative order; unwritten entries read back as the 0.0 fill value
+        DataSet* f12_squared_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& f12_squared_buffer = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
+
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                        double f12_squared_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                            // For each derivative operator j, collect the shell-center coordinate indices (3 * center + xyz) of every shell sitting on the differentiated atom
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
+
+                            // Now indices is a vector of vectors, where each subvector holds the choices for the first derivative operator, second, third, etc,
+                            // and the total number of subvectors is the order of differentiation.
+                            // We want all combinations that pick exactly one index from each subvector, i.e. a cartesian product.
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
+
+                            // Alternative (unused): binary search for the 1d buffer index of each entry of `index_combos`; the predefined lookup arrays below (orders 1-4 only) are used instead
+                            //for (auto vec : index_combos)  {
+                            //    std::sort(vec.begin(), vec.end());
+                            //    int buf_idx = 0;
+                            //    // buffer_multidim_lookup
+                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            //    buffer_indices.push_back(buf_idx);
+                            //}
+                            for (auto vec : index_combos)  {
+                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            }
+
+                            // Accumulate every contributing buffer entry into this nuc_idx slice; idx runs over the flattened shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto f12_squared_shellset = f12_squared_buffer[buffer_indices[i]];
+                                if (f12_squared_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                                f12_squared_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_squared_shellset[idx];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        } // For every nuc_idx 0, nderivs_triu
+                        // Now write this shell set slab to HDF5 file
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        // `fspace` is declared outside the parallel region, so every OpenMP thread shares it: selecting its hyperslab and writing through it
+                        // must be one serialized step, or another thread can re-select it in between. All HDF5 calls (incl. `mspace` lifetime) stay inside.
+#pragma omp critical (quax_hdf5_write)
+                        {
+                            fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                            DataSpace mspace(5, mem_dims);
+                            mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                            f12_squared_dataset->write(f12_squared_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                        }
+                    }
+                }
+            }
+        } // shell quartet loops
+    // Close the dataset for this derivative order
+    delete f12_squared_dataset;
+    } // deriv order loop 
+// Close the file
+delete file;
+std::cout << " done" << std::endl;
+} // f12_squared_deriv_disk function
+
+// Writes all F12G12 ints up to `max_deriv_order` to disk.
+// HDF5 File Name: f12g12_derivs.h5 
+//      HDF5 Dataset names within the file:
+//      f12g12_deriv1 
+//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
+//      f12g12_deriv2
+//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
+//      f12g12_deriv3
+//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
+//      ...
+void f12g12_deriv_disk(double beta, int max_deriv_order) { 
+    std::cout << "Writing two-electron F12G12 integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    const H5std_string file_name("f12g12_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
+    // Check to make sure you are not flooding the disk.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
+    auto cgtg_params = make_cgtg(beta);
+    
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer)
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of ERI's
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // Currently not used due to predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Libint engine for computing shell quartet derivatives
+        std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
+        cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+        cgtg_coulomb_engines[0].set_params(cgtg_params);
+        for (size_t i = 1; i != nthreads; ++i) {
+            cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
+        }
+
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name("f12g12_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* f12g12_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& f12g12_buffer = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
+
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                        double f12g12_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+    
+                            // Find out which 
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
+
+                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                            // and the total number of subvectors is the order of differentiation
+                            // Now we want all combinations where we pick exactly one index from each subvector.
+                            // This is achievable through a cartesian product 
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
+                            
+                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                            //for (auto vec : index_combos)  {
+                            //    std::sort(vec.begin(), vec.end());
+                            //    int buf_idx = 0;
+                            //    // buffer_multidim_lookup
+                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            //    buffer_indices.push_back(buf_idx);
+                            //}
+                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
+                            for (auto vec : index_combos)  {
+                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            }
+
+                            // Loop over shell block, keeping a total count idx for the size of shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto f12g12_shellset = f12g12_buffer[buffer_indices[i]];
+                                if (f12g12_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                                     for(auto f2 = 0; f2 != n2; ++f2) {
                                         for(auto f3 = 0; f3 != n3; ++f3) {
                                             for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
+                                                f12g12_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12g12_shellset[idx];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        } // For every nuc_idx 0, nderivs_triu
+                        // Now write this shell set slab to HDF5 file
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                        // Create dataspace defining for memory dataset to write to file
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        DataSpace mspace(5, mem_dims);
+                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+                        f12g12_dataset->write(f12g12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                    }
+                }
+            }
+        } // shell quartet loops
+    // Close the dataset for this derivative order
+    delete f12g12_dataset;
+    } // deriv order loop 
+// Close the file
+delete file;
+std::cout << " done" << std::endl;
+} // f12g12_deriv_disk function
+
+// Writes the nuclear derivative tensors of the F12 double commutator integrals for every order from 1 up to `max_deriv_order` to disk; `beta` is forwarded to make_cgtg() to build the `cgtg_params` handed to the Libint engines.
+// HDF5 File Name: f12_double_commutator_derivs.h5
+//      HDF5 Dataset names within the file:
+//      f12_double_commutator_deriv1 
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_1st_derivs)
+//      f12_double_commutator_deriv2
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_2nd_derivs)
+//      f12_double_commutator_deriv3
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_3rd_derivs)
+//      ...
+void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) { 
+    std::cout << "Writing two-electron F12 Double Commutator integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    const H5std_string file_name("f12_double_commutator_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
+    // Check to make sure you are not flooding the disk: estimate the total size in GB (8 bytes per value) summed over all derivative orders.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
+    auto cgtg_params = make_cgtg(beta);
+    
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer); computed but not used below
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of the integrals at this order (size of the last dataset dimension)
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // Currently not used due to predefined lookup arrays
+        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Libint engines for computing shell quartet derivatives, one per thread (engine 0 is configured, then copied to the rest)
+        std::vector<libint2::Engine> cgtg_del_engines(nthreads);
+        // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
+        cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
+        for (size_t i = 1; i != nthreads; ++i) {
+            cgtg_del_engines[i] = cgtg_del_engines[0];
+        }
+
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name("f12_double_commutator_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create the dataset for this derivative order; `plist` carries a fill value of 0.0
+        DataSet* f12_double_commutator_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
+
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue; // nothing is computed or written when all four shells sit on the same atom
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
+#ifdef _OPENMP
+                        thread_id = omp_get_thread_num();
+#endif
+                        cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& f12_double_commutator_buffer = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
+
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's. NOTE(review): the bounds are runtime values, so this is a variable-length array (a compiler extension in C++) - confirm it stays small enough for the stack at high derivative orders.
+                        double f12_double_commutator_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+    
+                            // Find out which of the four shell centers sit on the atom being differentiated; each match contributes shell derivative index 3 * center + coord
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
+
+                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                            // and the total number of subvectors is the order of differentiation
+                            // Now we want all combinations where we pick exactly one index from each subvector.
+                            // This is achievable through a cartesian product 
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
+                            
+                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                            //for (auto vec : index_combos)  {
+                            //    std::sort(vec.begin(), vec.end());
+                            //    int buf_idx = 0;
+                            //    // buffer_multidim_lookup
+                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            //    buffer_indices.push_back(buf_idx);
+                            //}
+                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
+                            for (auto vec : index_combos)  {
+                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            }
+
+                            // Loop over shell block, keeping a total count idx for the size of shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto f12_double_commutator_shellset = f12_double_commutator_buffer[buffer_indices[i]];
+                                if (f12_double_commutator_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                                f12_double_commutator_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_double_commutator_shellset[idx];
                                             }
                                         }
                                     }
                                 }
                             }
-                        } // For every nuc_idx 0, nderivs_triu
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                    }
+                        } // For every nuc_idx 0, nderivs_triu
+                        // Now write this shell set slab to HDF5 file. NOTE(review): `fspace` and the dataset are declared outside the parallel loop, yet every thread changes the `fspace` selection and writes here - looks like a data race unless HDF5 access is serialized elsewhere; confirm.
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                        // Create the memory dataspace describing the in-memory slab (same extents as `count`, selected from offset 0)
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        DataSpace mspace(5, mem_dims);
+                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+                        f12_double_commutator_dataset->write(f12_double_commutator_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                    }
+                }
+            }
+        } // shell quartet loops
+    // Close the dataset for this derivative order
+    delete f12_double_commutator_dataset;
+    } // deriv order loop 
+// Close the file
+delete file;
+std::cout << " done" << std::endl;
+} // f12_double_commutator_deriv_disk function
+
+// Computes a single 'deriv_order' derivative tensor of each OEI type, keeps everything in core memory. Returns {overlap, kinetic, potential} as flat arrays of length nbf1 * nbf2 * nderivs_triu, indexed as nuc_idx * nbf1 * nbf2 + i * nbf2 + j.
+std::vector<py::array> oei_deriv_core(int deriv_order) {
+    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
+    // how many shell and operator derivatives for potential integrals (neither count is used further down in this function)
+    int nshell_derivs = how_many_derivs(2, deriv_order);
+    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
+    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Define one engine of each type per thread: engine 0 is configured, then copied to the others; the nuclear engine gets its point charges from `atoms`
+    std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
+    int max_l = std::max(bs1.max_l(), bs2.max_l());
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+    v_engines[0].set_params(make_point_charges(atoms));
+    for (size_t i = 1; i != nthreads; ++i) {
+        s_engines[i] = s_engines[0];
+        t_engines[i] = t_engines[0];
+        v_engines[i] = v_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nderivs_triu; // one nbf1 x nbf2 block per unique nuclear derivative
+    std::vector<double> S(length); // value-initialized to 0.0; contributions are accumulated with += below
+    std::vector<double> T(length);
+    std::vector<double> V(length);
+
+#pragma omp parallel for collapse(2) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2; // start of this derivative's nbf1 x nbf2 block in the flat outputs
+
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                            potential_indices[j].push_back(tmp);
+                        }
+                    }
+                    // Now for potentials only, loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index. Operator-center indices are placed after the two shell centers, hence the (i + 2) offset.
+                    for (int i = 0; i < natom; i++){
+                        if (i == desired_atom_idx) {
+                            int tmp = 3 * (i + 2) + desired_coord;
+                            potential_indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+                std::vector<int> buffer_indices;
+                std::vector<int> potential_buffer_indices;
+                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                //for (auto vec : index_combos)  {
+                //    std::sort(vec.begin(), vec.end());
+                //    int buf_idx = 0;
+                //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                //    buffer_indices.push_back(buf_idx);
+                //}
+                for (auto vec : index_combos)  {
+                    if (deriv_order == 1) buffer_indices.push_back(buffer_index_oei1d[vec[0]]);
+                    else if (deriv_order == 2) buffer_indices.push_back(buffer_index_oei2d[vec[0]][vec[1]]);
+                    else if (deriv_order == 3) buffer_indices.push_back(buffer_index_oei3d[vec[0]][vec[1]][vec[2]]);
+                    else if (deriv_order == 4) buffer_indices.push_back(buffer_index_oei4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                }
+                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative, by sorting each multi-index and locating it with lower_bound (index 0 is used if lower_bound returns end()). NOTE(review): an exact match is not verified - presumably every sorted multi-index is present in the lookup; confirm.
+                for (auto vec : potential_index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                    potential_buffer_indices.push_back(buf_idx);
+                }
+
+                // Loop over shell block for each buffer index which contributes to this derivative
+                // Overlap and Kinetic
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
+                        }
+                    }
+                }
+                // Potential
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+        }
+    } // shell duet loops
+    return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // oei_deriv_core function
+
+// Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
+py::array eri_deriv_core(int deriv_order) {
+    // Number of unique shell derivatives output by libint (number of indices in buffer)
+    int nshell_derivs = how_many_derivs(4, deriv_order);
+    // Number of unique nuclear derivatives of ERI's
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Libint engine for computing shell quartet derivatives
+    std::vector<libint2::Engine> eri_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+    for (size_t i = 1; i != nthreads; ++i) {
+        eri_engines[i] = eri_engines[0];
+    }
+
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
+    std::vector<double> result(length);
+
+    // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+
+                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
+
+                        // Look up multidimensional cartesian derivative index
+                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
+                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                        for (int j = 0; j < multi_cart_idx.size(); j++){
+                            int desired_atom_idx = multi_cart_idx[j] / 3;
+                            int desired_coord = multi_cart_idx[j] % 3;
+                            for (int i = 0; i < 4; i++){
+                                int atom_idx = shell_atom_index_list[i];
+                                if (atom_idx == desired_atom_idx) {
+                                    int tmp = 3 * i + desired_coord;
+                                    indices[j].push_back(tmp);
+                                }
+                            }
+                        }
+
+                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                        // and the total number of subvectors is the order of differentiation
+                        // Now we want all combinations where we pick exactly one index from each subvector.
+                        // This is achievable through a cartesian product 
+                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                        std::vector<int> buffer_indices;
+                        
+                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                        //for (auto vec : index_combos)  {
+                        //    std::sort(vec.begin(), vec.end());
+                        //    int buf_idx = 0;
+                        //    // buffer_multidim_lookup
+                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        //    buffer_indices.push_back(buf_idx);
+                        //}
+                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
+                        for (auto vec : index_combos)  {
+                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                        }
+
+                        // Loop over shell block, keeping a total count idx for the size of shell set
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto eri_shellset = eri_buffer[buffer_indices[i]];
+                            if (eri_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        size_t offset_3 = (bf3 + f3) * nbf4;
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            size_t offset_4 = bf4 + f4;
+                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += eri_shellset[idx];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    } // For every nuc_idx 0, nderivs_triu
+                }
+            }
+        }
+    } // shell quartet loops
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // eri_deriv_core function
+
+// Computes a single 'deriv_order' derivative tensor of contracted Gaussian-type geminal integrals, keeps everything in core memory
+py::array f12_deriv_core(double beta, int deriv_order) {
+    // 'beta' is forwarded to make_cgtg() to build the geminal parameters handed to the libint engines.
+    // Returns a flat array of nderivs_triu * nbf1 * nbf2 * nbf3 * nbf4 doubles, indexed [nuc_idx][bf1][bf2][bf3][bf4] in row-major order.
+    // Number of unique nuclear derivatives of the integrals
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Libint engines for computing shell quartet derivatives: one per thread, all copies of engine 0
+    auto cgtg_params = make_cgtg(beta);
+    std::vector<libint2::Engine> cgtg_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+    cgtg_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) {
+        cgtg_engines[i] = cgtg_engines[0];
+    }
+
+    size_t length = static_cast<size_t>(nbf1) * nbf2 * nbf3 * nbf4 * nderivs_triu; // widen before multiplying so the element count cannot wrap in 32-bit arithmetic
+    std::vector<double> result(length); // zero-initialized accumulator
+
+    // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue; // all four shells on one atom: skipped (presumably zero by translational invariance -- confirm)
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& f12_buffer = cgtg_engines[thread_id].results(); // will point to computed shell sets
+
+                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                        size_t offset_nuc_idx = static_cast<size_t>(nuc_idx) * nbf1 * nbf2 * nbf3 * nbf4; // evaluated in size_t to avoid 32-bit wrap-around
+
+                        // Look up multidimensional cartesian derivative index (by reference: the lookup table is read-only)
+                        const auto& multi_cart_idx = cart_multidim_lookup[nuc_idx];
+
+                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
+                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                        for (int j = 0; j < multi_cart_idx.size(); j++){
+                            int desired_atom_idx = multi_cart_idx[j] / 3;
+                            int desired_coord = multi_cart_idx[j] % 3;
+                            for (int i = 0; i < 4; i++){
+                                int atom_idx = shell_atom_index_list[i];
+                                if (atom_idx == desired_atom_idx) {
+                                    int tmp = 3 * i + desired_coord;
+                                    indices[j].push_back(tmp);
+                                }
+                            }
+                        }
+
+                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                        // and the total number of subvectors is the order of differentiation
+                        // Now we want all combinations where we pick exactly one index from each subvector.
+                        // This is achievable through a cartesian product
+                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                        std::vector<int> buffer_indices;
+
+                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                        //for (auto vec : index_combos)  {
+                        //    std::sort(vec.begin(), vec.end());
+                        //    int buf_idx = 0;
+                        //    // buffer_multidim_lookup
+                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        //    buffer_indices.push_back(buf_idx);
+                        //}
+                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them
+                        for (const auto& vec : index_combos)  {
+                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                        }
+
+                        // Loop over shell block, keeping a total count idx for the size of shell set
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto f12_shellset = f12_buffer[buffer_indices[i]];
+                            if (f12_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        size_t offset_3 = (bf3 + f3) * nbf4;
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            size_t offset_4 = bf4 + f4;
+                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_shellset[idx];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    } // For every nuc_idx 0, nderivs_triu
+                }
+            }
+        }
+    } // shell quartet loops
+    return py::array(result.size(), result.data()); // No base object is passed, so py::array copies the buffer; see https://github.com/pybind/pybind11/issues/1042 for a zero-copy workaround
+} // f12_deriv_core function
+
+// Computes a single 'deriv_order' derivative tensor of squared contracted Gaussian-type geminal integrals, keeps everything in core memory
+py::array f12_squared_deriv_core(double beta, int deriv_order) {
+    // 'beta' is forwarded to make_cgtg(); the result is passed through take_square() before being handed to the libint engines.
+    // Returns a flat array of nderivs_triu * nbf1 * nbf2 * nbf3 * nbf4 doubles, indexed [nuc_idx][bf1][bf2][bf3][bf4] in row-major order.
+    // Number of unique nuclear derivatives of the integrals
+    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Currently unused due to predefined lookup arrays
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+    // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+    // Libint engines for computing shell quartet derivatives: one per thread, all copies of engine 0
+    auto cgtg_params = take_square(make_cgtg(beta));
+    std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+    cgtg_squared_engines[0].set_params(cgtg_params);
+    for (size_t i = 1; i != nthreads; ++i) {
+        cgtg_squared_engines[i] = cgtg_squared_engines[0];
+    }
+
+    size_t length = static_cast<size_t>(nbf1) * nbf2 * nbf3 * nbf4 * nderivs_triu; // widen before multiplying so the element count cannot wrap in 32-bit arithmetic
+    std::vector<double> result(length); // zero-initialized accumulator
+
+    // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue; // all four shells on one atom: skipped (presumably zero by translational invariance -- confirm)
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                    size_t thread_id = 0;
+#ifdef _OPENMP
+                    thread_id = omp_get_thread_num();
+#endif
+                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& f12_squared_buffer = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
+
+                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                        size_t offset_nuc_idx = static_cast<size_t>(nuc_idx) * nbf1 * nbf2 * nbf3 * nbf4; // evaluated in size_t to avoid 32-bit wrap-around
+
+                        // Look up multidimensional cartesian derivative index (by reference: the lookup table is read-only)
+                        const auto& multi_cart_idx = cart_multidim_lookup[nuc_idx];
+
+                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
+                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                        for (int j = 0; j < multi_cart_idx.size(); j++){
+                            int desired_atom_idx = multi_cart_idx[j] / 3;
+                            int desired_coord = multi_cart_idx[j] % 3;
+                            for (int i = 0; i < 4; i++){
+                                int atom_idx = shell_atom_index_list[i];
+                                if (atom_idx == desired_atom_idx) {
+                                    int tmp = 3 * i + desired_coord;
+                                    indices[j].push_back(tmp);
+                                }
+                            }
+                        }
+
+                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                        // and the total number of subvectors is the order of differentiation
+                        // Now we want all combinations where we pick exactly one index from each subvector.
+                        // This is achievable through a cartesian product
+                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                        std::vector<int> buffer_indices;
+
+                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                        //for (auto vec : index_combos)  {
+                        //    std::sort(vec.begin(), vec.end());
+                        //    int buf_idx = 0;
+                        //    // buffer_multidim_lookup
+                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        //    buffer_indices.push_back(buf_idx);
+                        //}
+                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them
+                        for (const auto& vec : index_combos)  {
+                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                        }
+
+                        // Loop over shell block, keeping a total count idx for the size of shell set
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto f12_squared_shellset = f12_squared_buffer[buffer_indices[i]];
+                            if (f12_squared_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        size_t offset_3 = (bf3 + f3) * nbf4;
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            size_t offset_4 = bf4 + f4;
+                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_squared_shellset[idx];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    } // For every nuc_idx 0, nderivs_triu
                 }
             }
-        } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete eri_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
-} // eri_deriv_disk function
+        }
+    } // shell quartet loops
+    return py::array(result.size(), result.data()); // No base object is passed, so py::array copies the buffer; see https://github.com/pybind/pybind11/issues/1042 for a zero-copy workaround
+} // f12_squared_deriv_core function
 
-// Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
-std::vector<py::array> oei_deriv_core(int deriv_order) {
-    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-    // how many shell and operator derivatives for potential integrals
-    int nshell_derivs = how_many_derivs(2, deriv_order);
-    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
-    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+// Computes a single 'deriv_order' derivative tensor of contracted Gaussian-type geminal times Coulomb repulsion integrals, keeps everything in core memory
+py::array f12g12_deriv_core(double beta, int deriv_order) {
+    // Number of unique shell derivatives output by libint (number of indices in buffer)
+    int nshell_derivs = how_many_derivs(4, deriv_order);
+    // Number of unique nuclear derivatives of ERI's
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
     // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-    const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
+    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
-    // Define engines and buffers
-    std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
-    v_engines[0].set_params(make_point_charges(atoms));
+    // Libint engine for computing shell quartet derivatives
+    auto cgtg_params = make_cgtg(beta);
+    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
+    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+    cgtg_coulomb_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
-        s_engines[i] = s_engines[0];
-        t_engines[i] = t_engines[0];
-        v_engines[i] = v_engines[0];
+        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
     }
 
-    size_t length = nbf1 * nbf2 * nderivs_triu;
-    std::vector<double> S(length);
-    std::vector<double> T(length);
-    std::vector<double> V(length);
+    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
+    std::vector<double> result(length);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
+    // Begin shell quartet loops
+#pragma omp parallel for collapse(4) num_threads(nthreads)
     for(auto s1 = 0; s1 != bs1.size(); ++s1) {
         for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-            std::vector<long> shell_atom_index_list{atom1, atom2};
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
-            size_t thread_id = 0;
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                    size_t thread_id = 0;
 #ifdef _OPENMP
-            thread_id = omp_get_thread_num();
+                    thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
-
-            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
+                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& f12g12_buffer = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
 
-                // Look up multidimensional cartesian derivative index
-                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                // What follows fills these indices
-                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
 
-                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                for (int j = 0; j < multi_cart_idx.size(); j++){
-                    int desired_atom_idx = multi_cart_idx[j] / 3;
-                    int desired_coord = multi_cart_idx[j] % 3;
-                    // Loop over shell indices
-                    for (int i = 0; i < 2; i++){
-                        int atom_idx = shell_atom_index_list[i];
-                        if (atom_idx == desired_atom_idx) {
-                            int tmp = 3 * i + desired_coord;
-                            indices[j].push_back(tmp);
-                            potential_indices[j].push_back(tmp);
-                        }
-                    }
-                    // Now for potentials only, loop over each atom in molecule, and if this derivative
-                    // differentiates wrt that atom, we also need to collect that index.
-                    for (int i = 0; i < natom; i++){
-                        if (i == desired_atom_idx) {
-                            int tmp = 3 * (i + 2) + desired_coord;
-                            potential_indices[j].push_back(tmp);
+                        // Look up multidimensional cartesian derivative index
+                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
+                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                        for (int j = 0; j < multi_cart_idx.size(); j++){
+                            int desired_atom_idx = multi_cart_idx[j] / 3;
+                            int desired_coord = multi_cart_idx[j] % 3;
+                            for (int i = 0; i < 4; i++){
+                                int atom_idx = shell_atom_index_list[i];
+                                if (atom_idx == desired_atom_idx) {
+                                    int tmp = 3 * i + desired_coord;
+                                    indices[j].push_back(tmp);
+                                }
+                            }
                         }
-                    }
-                }
-
-                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                // and the total number of subvectors is the order of differentiation
-                // Now we want all combinations where we pick exactly one index from each subvector.
-                // This is achievable through a cartesian product
-                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
-                std::vector<int> buffer_indices;
-                std::vector<int> potential_buffer_indices;
-                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                //for (auto vec : index_combos)  {
-                //    std::sort(vec.begin(), vec.end());
-                //    int buf_idx = 0;
-                //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                //    buffer_indices.push_back(buf_idx);
-                //}
-                for (auto vec : index_combos)  {
-                    if (deriv_order == 1) buffer_indices.push_back(buffer_index_oei1d[vec[0]]);
-                    else if (deriv_order == 2) buffer_indices.push_back(buffer_index_oei2d[vec[0]][vec[1]]);
-                    else if (deriv_order == 3) buffer_indices.push_back(buffer_index_oei3d[vec[0]][vec[1]][vec[2]]);
-                    else if (deriv_order == 4) buffer_indices.push_back(buffer_index_oei4d[vec[0]][vec[1]][vec[2]][vec[3]]);
-                }
-                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : potential_index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                    potential_buffer_indices.push_back(buf_idx);
-                }
 
-                // Loop over shell block for each buffer index which contributes to this derivative
-                // Overlap and Kinetic
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
-                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
+                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                        // and the total number of subvectors is the order of differentiation
+                        // Now we want all combinations where we pick exactly one index from each subvector.
+                        // This is achievable through a cartesian product 
+                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                        std::vector<int> buffer_indices;
+                        
+                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                        //for (auto vec : index_combos)  {
+                        //    std::sort(vec.begin(), vec.end());
+                        //    int buf_idx = 0;
+                        //    // buffer_multidim_lookup
+                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        //    buffer_indices.push_back(buf_idx);
+                        //}
+                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
+                        for (auto vec : index_combos)  {
+                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
+                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
+                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
+                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
                         }
-                    }
-                }
-                // Potential
-                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
+
+                        // Loop over shell block, keeping a total count idx for the size of shell set
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto f12g12_shellset = f12g12_buffer[buffer_indices[i]];
+                            if (f12g12_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        size_t offset_3 = (bf3 + f3) * nbf4;
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            size_t offset_4 = bf4 + f4;
+                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12g12_shellset[idx];
+                                        }
+                                    }
+                                }
+                            }
                         }
-                    }
+                    } // For every nuc_idx 0, nderivs_triu
                 }
-            } // Unique nuclear cartesian derivative indices loop
+            }
         }
-    } // shell duet loops
-    return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // oei_deriv_core function
+    } // shell quartet loops
+    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // f12g12_deriv_core function
 
-// Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
-py::array eri_deriv_core(int deriv_order) {
+// Computes a single 'deriv_order' derivative tensor of gradient norm of contracted Gaussian-type geminal integrals, keeps everything in core memory
+py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
     // Number of unique shell derivatives output by libint (number of indices in buffer)
     int nshell_derivs = how_many_derivs(4, deriv_order);
     // Number of unique nuclear derivatives of ERI's
@@ -1420,12 +3518,14 @@ py::array eri_deriv_core(int deriv_order) {
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
-    std::vector<libint2::Engine> eri_engines(nthreads);
+    auto cgtg_params = make_cgtg(beta);
+    std::vector<libint2::Engine> cgtg_del_engines(nthreads);
     size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
     int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+    // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
+    cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
     for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
+        cgtg_del_engines[i] = cgtg_del_engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
@@ -1457,8 +3557,8 @@ py::array eri_deriv_core(int deriv_order) {
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+                    cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& f12_double_commutator_buffer = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                     for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
@@ -1507,8 +3607,8 @@ py::array eri_deriv_core(int deriv_order) {
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto eri_shellset = eri_buffer[buffer_indices[i]];
-                            if (eri_shellset == nullptr) continue;
+                            auto f12_double_commutator_shellset = f12_double_commutator_buffer[buffer_indices[i]];
+                            if (f12_double_commutator_shellset == nullptr) continue;
                             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                                 size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                                 for(auto f2 = 0; f2 != n2; ++f2) {
@@ -1517,7 +3617,7 @@ py::array eri_deriv_core(int deriv_order) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                             size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += eri_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_double_commutator_shellset[idx];
                                         }
                                     }
                                 }
@@ -1529,7 +3629,7 @@ py::array eri_deriv_core(int deriv_order) {
         }
     } // shell quartet loops
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // eri_deriv_core function
+} // f12_double_commutator_deriv_core function
 
 // Define module named 'libint_interface' which can be imported with python
 // The second arg, 'm' defines a variable py::module_ which can be used to create
@@ -1542,14 +3642,30 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("kinetic", &kinetic, "Computes kinetic integrals with libint");
     m.def("potential", &potential, "Computes potential integrals with libint");
     m.def("eri", &eri, "Computes electron repulsion integrals with libint");
+    m.def("f12", &f12, "Computes contracted Gaussian-type geminal integrals with libint");
+    m.def("f12_squared", &f12_squared, "Computes sqaured contracted Gaussian-type geminal integrals with libint");
+    m.def("f12g12", &f12g12, "Computes contracted Gaussian-type geminal times Coulomb repulsion integrals with libint");
+    m.def("f12_double_commutator", &f12_double_commutator, "Computes gradient norm of contracted Gaussian-type geminal integrals with libint");
     m.def("overlap_deriv", &overlap_deriv, "Computes overlap integral nuclear derivatives with libint");
     m.def("kinetic_deriv", &kinetic_deriv, "Computes kinetic integral nuclear derivatives with libint");
     m.def("potential_deriv", &potential_deriv, "Computes potential integral nuclear derivatives with libint");
     m.def("eri_deriv", &eri_deriv, "Computes electron repulsion integral nuclear derivatives with libint");
+    m.def("f12_deriv", &f12_deriv, "Computes contracted Gaussian-type geminal integral nuclear derivatives with libint");
+    m.def("f12_squared_deriv", &f12_squared_deriv, "Computes sqaured contracted Gaussian-type geminal integral nuclear derivatives with libint");
+    m.def("f12g12_deriv", &f12g12_deriv, "Computes contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivatives with libint");
+    m.def("f12_double_commutator_deriv", &f12_double_commutator_deriv, "Computes gradient norm of contracted Gaussian-type geminal integral nuclear derivatives with libint");
     m.def("oei_deriv_disk", &oei_deriv_disk, "Computes overlap, kinetic, and potential integral derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("eri_deriv_disk", &eri_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("f12_deriv_disk", &f12_deriv_disk, "Computes contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("f12_squared_deriv_disk", &f12_squared_deriv_disk, "Computes sqaured contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("f12g12_deriv_disk", &f12g12_deriv_disk, "Computes contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("f12_double_commutator_deriv_disk", &f12_double_commutator_deriv_disk, "Computes gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_core", &oei_deriv_core, "Computes a single OEI integral derivative tensor, in memory.");
     m.def("eri_deriv_core", &eri_deriv_core, "Computes a single coulomb integral nuclear derivative tensor, in memory.");
+    m.def("f12_deriv_core", &f12_deriv_core, "Computes a single contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    m.def("f12_squared_deriv_core", &f12_squared_deriv_core, "Computes a single sqaured contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    m.def("f12g12_deriv_core", &f12g12_deriv_core, "Computes a single contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensor, in memory.");
+    m.def("f12_double_commutator_deriv_core", &f12_double_commutator_deriv_core, "Computes a single gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
     //TODO partial derivative impl's
     //m.def("eri_partial_deriv_disk", &eri_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
      m.attr("LIBINT2_MAX_DERIV_ORDER") = LIBINT2_MAX_DERIV_ORDER;
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 746179f..2f8bc83 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -11,15 +11,15 @@
 
 class OEI(object):
 
-    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
+    def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0) # Not generalized yet
         natoms = molecule.natom()
         nbf = basis_set.nbf()
 
-        if mode == 'core' and max_deriv_order > 0:
+        if 'core' in mode and max_deriv_order > 0:
             # A list of OEI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
             # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf)
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 5c6b98f..82610d0 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -11,15 +11,15 @@
 
 class TEI(object):
 
-    def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
+    def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mode):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0) # Not generalized yet
         natoms = molecule.natom()
         nbf = basis_set.nbf()
 
-        if mode == 'core' and max_deriv_order > 0:
+        if 'core' in mode and max_deriv_order > 0:
             # A list of ERI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
             # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf,nbf,nbf)
@@ -36,17 +36,45 @@ def __init__(self, basis_name, xyz_path, max_deriv_order, mode):
         # Create new JAX primitive for TEI evaluation
         self.eri_p = jax.core.Primitive("eri")
         self.eri_deriv_p = jax.core.Primitive("eri_deriv")
+        self.f12_p = jax.core.Primitive("f12")
+        self.f12_deriv_p = jax.core.Primitive("f12_deriv")
+        self.f12_squared_p = jax.core.Primitive("f12_squared")
+        self.f12_squared_deriv_p = jax.core.Primitive("f12_squared_deriv")
+        self.f12g12_p = jax.core.Primitive("f12g12")
+        self.f12g12_deriv_p = jax.core.Primitive("f12g12_deriv")
+        self.f12_double_commutator_p = jax.core.Primitive("f12_double_commutator")
+        self.f12_double_commutator_deriv_p = jax.core.Primitive("f12_double_commutator_deriv")
 
         # Register primitive evaluation rules
         self.eri_p.def_impl(self.eri_impl)
         self.eri_deriv_p.def_impl(self.eri_deriv_impl)
+        self.f12_p.def_impl(self.f12_impl)
+        self.f12_deriv_p.def_impl(self.f12_deriv_impl)
+        self.f12_squared_p.def_impl(self.f12_squared_impl)
+        self.f12_squared_deriv_p.def_impl(self.f12_squared_deriv_impl)
+        self.f12g12_p.def_impl(self.f12g12_impl)
+        self.f12g12_deriv_p.def_impl(self.f12g12_deriv_impl)
+        self.f12_double_commutator_p.def_impl(self.f12_double_commutator_impl)
+        self.f12_double_commutator_deriv_p.def_impl(self.f12_double_commutator_deriv_impl)
 
         # Register the JVP rules with JAX
         jax.interpreters.ad.primitive_jvps[self.eri_p] = self.eri_jvp
         jax.interpreters.ad.primitive_jvps[self.eri_deriv_p] = self.eri_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_p] = self.f12_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_deriv_p] = self.f12_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_squared_p] = self.f12_squared_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_squared_deriv_p] = self.f12_squared_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12g12_p] = self.f12g12_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12g12_deriv_p] = self.f12g12_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_double_commutator_p] = self.f12_double_commutator_jvp
+        jax.interpreters.ad.primitive_jvps[self.f12_double_commutator_deriv_p] = self.f12_double_commutator_deriv_jvp
 
         # Register tei_deriv batching rule with JAX
         jax.interpreters.batching.primitive_batchers[self.eri_deriv_p] = self.eri_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.f12_deriv_p] = self.f12_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.f12_squared_deriv_p] = self.f12_squared_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.f12g12_deriv_p] = self.f12g12_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.f12_double_commutator_deriv_p] = self.f12_double_commutator_deriv_batch
 
     # Create functions to call primitives
     def eri(self, geom):
@@ -55,6 +83,30 @@ def eri(self, geom):
     def eri_deriv(self, geom, deriv_vec):
         return self.eri_deriv_p.bind(geom, deriv_vec)
 
+    def f12(self, geom, beta):
+        return self.f12_p.bind(geom, beta)
+
+    def f12_deriv(self, geom, beta, deriv_vec):
+        return self.f12_deriv_p.bind(geom, beta, deriv_vec)
+
+    def f12_squared(self, geom, beta):
+        return self.f12_squared_p.bind(geom, beta)
+
+    def f12_squared_deriv(self, geom, beta, deriv_vec):
+        return self.f12_squared_deriv_p.bind(geom, beta, deriv_vec)
+
+    def f12g12(self, geom, beta):
+        return self.f12g12_p.bind(geom, beta)
+
+    def f12g12_deriv(self, geom, beta, deriv_vec):
+        return self.f12g12_deriv_p.bind(geom, beta, deriv_vec)
+
+    def f12_double_commutator(self, geom, beta):
+        return self.f12_double_commutator_p.bind(geom, beta)
+
+    def f12_double_commutator_deriv(self, geom, beta, deriv_vec):
+        return self.f12_double_commutator_deriv_p.bind(geom, beta, deriv_vec)
+
     # Create primitive evaluation rules
     def eri_impl(self, geom):
         G = libint_interface.eri()
@@ -62,6 +114,30 @@ def eri_impl(self, geom):
         G = G.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
         return jnp.asarray(G)
 
+    def f12_impl(self, geom, beta):
+        F = libint_interface.f12(beta)
+        #d = int(np.sqrt(np.sqrt(G.shape[0])))
+        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        return jnp.asarray(F)
+
+    def f12_squared_impl(self, geom, beta):
+        F = libint_interface.f12_squared(beta)
+        #d = int(np.sqrt(np.sqrt(G.shape[0])))
+        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        return jnp.asarray(F)
+
+    def f12g12_impl(self, geom, beta):
+        F = libint_interface.f12g12(beta)
+        #d = int(np.sqrt(np.sqrt(G.shape[0])))
+        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        return jnp.asarray(F)
+    
+    def f12_double_commutator_impl(self, geom, beta):
+        F = libint_interface.f12_double_commutator(beta)
+        #d = int(np.sqrt(np.sqrt(G.shape[0])))
+        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        return jnp.asarray(F)
+
     def eri_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
@@ -95,12 +171,51 @@ def eri_deriv_impl(self, geom, deriv_vec):
                     raise Exception("Something went wrong reading integral derivative file")
             return jnp.asarray(G)
 
+    def f12_deriv_impl(self, geom, beta, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        #idx = get_deriv_vec_idx(deriv_vec)
+
+        # Use eri derivatives in memory
+        if self.mode == 'core':
+            F = libint_interface.f12_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+
+    def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        #idx = get_deriv_vec_idx(deriv_vec)
+
+        # Use eri derivatives in memory
+        if self.mode == 'core':
+            F = libint_interface.f12_squared_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+
+    def f12g12_deriv_impl(self, geom, beta, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        #idx = get_deriv_vec_idx(deriv_vec)
+
+        # Use eri derivatives in memory
+        if self.mode == 'core':
+            F = libint_interface.f12g12_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+
+    def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        #idx = get_deriv_vec_idx(deriv_vec)
+
+        # Use eri derivatives in memory
+        if self.mode == 'core':
+            F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
 
     # Create Jacobian-vector product rule, which given some input args (primals)
     # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
     # and the slice of the Jacobian (tangents_out)
     def eri_jvp(self, primals, tangents):
-        geom, = primals
+        geom = primals
         primals_out = self.eri(geom)
         tangents_out = self.eri_deriv(geom, tangents[0])
         return primals_out, tangents_out
@@ -113,6 +228,62 @@ def eri_deriv_jvp(self, primals, tangents):
         tangents_out = self.eri_deriv(geom, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
+    def f12_jvp(self, primals, tangents):
+        geom, beta = primals
+        primals_out = self.f12(geom, beta)
+        tangents_out = self.f12_deriv(geom, beta, tangents[0])
+        return primals_out, tangents_out
+
+    def f12_deriv_jvp(self, primals, tangents):
+        geom, beta, deriv_vec = primals
+        primals_out = self.f12_deriv(geom, beta, deriv_vec)
+        # Here we add the current value of deriv_vec to the incoming tangent vector,
+        # so that nested higher order differentiation works
+        tangents_out = self.f12_deriv(geom, beta, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    def f12_squared_jvp(self, primals, tangents):
+        geom, beta = primals
+        primals_out = self.f12_squared(geom, beta)
+        tangents_out = self.f12_squared_deriv(geom, beta, tangents[0])
+        return primals_out, tangents_out
+
+    def f12_squared_deriv_jvp(self, primals, tangents):
+        geom, beta, deriv_vec = primals
+        primals_out = self.f12_squared_deriv(geom, beta, deriv_vec)
+        # Here we add the current value of deriv_vec to the incoming tangent vector,
+        # so that nested higher order differentiation works
+        tangents_out = self.f12_squared_deriv(geom, beta, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    def f12g12_jvp(self, primals, tangents):
+        geom, beta = primals
+        primals_out = self.f12g12(geom, beta)
+        tangents_out = self.f12g12_deriv(geom, beta, tangents[0])
+        return primals_out, tangents_out
+
+    def f12g12_deriv_jvp(self, primals, tangents):
+        geom, beta, deriv_vec = primals
+        primals_out = self.f12g12_deriv(geom, beta, deriv_vec)
+        # Here we add the current value of deriv_vec to the incoming tangent vector,
+        # so that nested higher order differentiation works
+        tangents_out = self.f12g12_deriv(geom, beta, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
+    def f12_double_commutator_jvp(self, primals, tangents):
+        geom, beta = primals
+        primals_out = self.f12_double_commutator(geom, beta)
+        tangents_out = self.f12_double_commutator_deriv(geom, beta, tangents[0])
+        return primals_out, tangents_out
+
+    def f12_double_commutator_deriv_jvp(self, primals, tangents):
+        geom, beta, deriv_vec = primals
+        primals_out = self.f12_double_commutator_deriv(geom, beta, deriv_vec)
+        # Here we add the current value of deriv_vec to the incoming tangent vector,
+        # so that nested higher order differentiation works
+        tangents_out = self.f12_double_commutator_deriv(geom, beta, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP of tei
     def eri_deriv_batch(self, batched_args, batch_dims):
         # When the input argument of deriv_batch is batched along the 0'th axis
@@ -128,4 +299,64 @@ def eri_deriv_batch(self, batched_args, batch_dims):
             results.append(jnp.expand_dims(tmp, axis=0))
         results = jnp.concatenate(results, axis=0)
         return results, 0
+    
+    def f12_deriv_batch(self, batched_args, batch_dims):
+        # When the input argument of deriv_batch is batched along the 0'th axis
+        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+        # (expand dims at 0 and concatenate at 0)
+        # and then return the results, indicating the out batch axis
+        # is in the 0th position (return results, 0)
+        geom_batch, beta_batch, deriv_batch = batched_args
+        geom_dim, beta_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.f12_deriv(geom_batch, beta_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
+    def f12_squared_deriv_batch(self, batched_args, batch_dims):
+        # When the input argument of deriv_batch is batched along the 0'th axis
+        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+        # (expand dims at 0 and concatenate at 0)
+        # and then return the results, indicating the out batch axis
+        # is in the 0th position (return results, 0)
+        geom_batch, beta_batch, deriv_batch = batched_args
+        geom_dim, beta_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.f12_squared_deriv(geom_batch, beta_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
+    def f12g12_deriv_batch(self, batched_args, batch_dims):
+        # When the input argument of deriv_batch is batched along the 0'th axis
+        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+        # (expand dims at 0 and concatenate at 0)
+        # and then return the results, indicating the out batch axis
+        # is in the 0th position (return results, 0)
+        geom_batch, beta_batch, deriv_batch = batched_args
+        geom_dim, beta_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.f12g12_deriv(geom_batch, beta_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
+    def f12_double_commutator_deriv_batch(self, batched_args, batch_dims):
+        # When the input argument of deriv_batch is batched along the 0'th axis
+        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+        # (expand dims at 0 and concatenate at 0)
+        # and then return the results, indicating the out batch axis
+        # is in the 0th position (return results, 0)
+        geom_batch, beta_batch, deriv_batch = batched_args
+        geom_dim, beta_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp = self.f12_double_commutator_deriv(geom_batch, beta_batch, i)
+            results.append(jnp.expand_dims(tmp, axis=0))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
 
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 6255d79..9b2d823 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -18,23 +18,22 @@
 def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk 
     algo = options['integral_algo']
+    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
         check = check_disk(geom,basis_name,xyz_path,deriv_order)
 
-        tei_obj = TEI(basis_name, xyz_path, deriv_order, 'disk')
-        oei_obj = OEI(basis_name, xyz_path, deriv_order, 'disk')
+        tei_obj = TEI(basis_name, basis_name, basis_name, basis_name, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis_name, basis_name, xyz_path, deriv_order, 'disk')
         # If disk integral derivs are right, nothing to do
         if check:
-            libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
             G = tei_obj.eri(geom)
             libint_interface.finalize()
         else:
-            libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
             libint_interface.oei_deriv_disk(deriv_order)
             libint_interface.eri_deriv_disk(deriv_order)
             S = oei_obj.overlap(geom)
@@ -44,17 +43,16 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             libint_interface.finalize()
 
     else:
-        libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
         # Precompute TEI derivatives
-        tei_obj = TEI(basis_name, xyz_path, deriv_order, 'core')
-        oei_obj = OEI(basis_name, xyz_path, deriv_order, 'core')
+        tei_obj = TEI(basis_name, basis_name, basis_name, basis_name, xyz_path, deriv_order, 'core')
+        oei_obj = OEI(basis_name, basis_name, xyz_path, deriv_order, 'core')
         # Compute integrals
         S = oei_obj.overlap(geom)
         T = oei_obj.kinetic(geom)
         V = oei_obj.potential(geom)
         G = tei_obj.eri(geom)
-        libint_interface.finalize()
 
+    libint_interface.finalize()
     return S, T, V, G
 
 def check_disk(geom,basis_name,xyz_path,deriv_order,address=None):

From 9bbb27fe7b723a42f275c72c812473e89727c38a Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 27 Sep 2023 16:02:32 -0400
Subject: [PATCH 12/91] Fixed broken tests from last commit

---
 quax/integrals/oei.py |  6 +++---
 quax/integrals/tei.py | 21 ++++++++++-----------
 quax/methods/ints.py  | 12 ------------
 3 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 2f8bc83..3233183 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -106,7 +106,7 @@ def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if self.mode == 'core':
+        if 'core' in self.mode:
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(S)
         else:
@@ -133,7 +133,7 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if self.mode == 'core':
+        if 'core' in self.mode:
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(T)
         else:
@@ -160,7 +160,7 @@ def potential_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if self.mode == 'core':
+        if 'core' in self.mode:
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(V)
         else:
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 82610d0..29f3eed 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -144,7 +144,7 @@ def eri_deriv_impl(self, geom, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if self.mode == 'core':
+        if 'core' in self.mode:
             G = self.eri_derivatives[deriv_order-1][idx,:,:,:,:]
             return jnp.asarray(G)
 
@@ -177,7 +177,7 @@ def f12_deriv_impl(self, geom, beta, deriv_vec):
         #idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if self.mode == 'core':
+        if 'core' in self.mode:
             F = libint_interface.f12_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
 
@@ -187,7 +187,7 @@ def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
         #idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if self.mode == 'core':
+        if 'core' in self.mode:
             F = libint_interface.f12_squared_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
 
@@ -197,7 +197,7 @@ def f12g12_deriv_impl(self, geom, beta, deriv_vec):
         #idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if self.mode == 'core':
+        if 'core' in self.mode:
             F = libint_interface.f12g12_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
 
@@ -207,7 +207,7 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
         #idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if self.mode == 'core':
+        if 'core' in self.mode:
             F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
 
@@ -215,7 +215,7 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
     # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
     # and the slice of the Jacobian (tangents_out)
     def eri_jvp(self, primals, tangents):
-        geom = primals
+        geom, = primals
         primals_out = self.eri(geom)
         tangents_out = self.eri_deriv(geom, tangents[0])
         return primals_out, tangents_out
@@ -229,7 +229,7 @@ def eri_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_jvp(self, primals, tangents):
-        geom, beta = primals
+        geom, beta, = primals
         primals_out = self.f12(geom, beta)
         tangents_out = self.f12_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -243,7 +243,7 @@ def f12_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_squared_jvp(self, primals, tangents):
-        geom, beta = primals
+        geom, beta, = primals
         primals_out = self.f12_squared(geom, beta)
         tangents_out = self.f12_squared_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -257,7 +257,7 @@ def f12_squared_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12g12_jvp(self, primals, tangents):
-        geom, beta = primals
+        geom, beta, = primals
         primals_out = self.f12g12(geom, beta)
         tangents_out = self.f12g12_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -271,7 +271,7 @@ def f12g12_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_double_commutator_jvp(self, primals, tangents):
-        geom, beta = primals
+        geom, beta, = primals
         primals_out = self.f12_double_commutator(geom, beta)
         tangents_out = self.f12_double_commutator_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -359,4 +359,3 @@ def f12_double_commutator_deriv_batch(self, batched_args, batch_dims):
             results.append(jnp.expand_dims(tmp, axis=0))
         results = jnp.concatenate(results, axis=0)
         return results, 0
-
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 9b2d823..d4a85ea 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -32,7 +32,6 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
             G = tei_obj.eri(geom)
-            libint_interface.finalize()
         else:
             libint_interface.oei_deriv_disk(deriv_order)
             libint_interface.eri_deriv_disk(deriv_order)
@@ -40,7 +39,6 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
             G = tei_obj.eri(geom)
-            libint_interface.finalize()
 
     else:
         # Precompute TEI derivatives
@@ -95,13 +93,3 @@ def check_disk(geom,basis_name,xyz_path,deriv_order,address=None):
         correct_nbf = oeifile[sample_dataset_name].shape[0] == nbf
         correct_int_derivs = correct_nbf
     return correct_int_derivs
-
-
-
-              
-
-
-
-    
-
-

From bc1160a050c3acef53d8b85b418aaeccf853a0e0 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 28 Sep 2023 11:49:26 -0400
Subject: [PATCH 13/91] Generalize to use multiple basis sets

---
 quax/integrals/basis_utils.py |   4 +-
 quax/integrals/makefile       |   2 +-
 quax/integrals/oei.py         |  28 +++---
 quax/integrals/tei.py         | 174 +++++++++++++++++++++++++++++-----
 quax/methods/ints.py          |   4 +-
 quax/methods/mp2.py           |  10 +-
 6 files changed, 175 insertions(+), 47 deletions(-)

diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 580f651..8d039ae 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -2,13 +2,13 @@
 import jax.numpy as jnp
 import numpy as np
 
-def build_basis_set(molecule, basis):
+def build_basis_set(molecule, basis_name):
     # Avoids printing from psi4
     psi4.core.be_quiet()
     # Create empty dictionary to hold basis information
     basis_dict = {}
     # Build basis in Psi4
-    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis, puream=0)
+    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     # Get total number of shells for the molecule
     nshell = basis_set.nshell()
     # Loop over each shell
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index 26123f1..eb7acde 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -2,7 +2,7 @@
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
 # Options passed to compiler
-CFLAGS  := -O3 -fPIC -fopenmp -g
+CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
 LIBINT_PREFIX := /home/ecm23353/psi_env
 
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 3233183..68c8d3a 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -15,9 +15,12 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0) # Not generalized yet
         natoms = molecule.natom()
-        nbf = basis_set.nbf()
+
+        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
+        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
+        nbf1 = bs1.nbf()
+        nbf2 = bs2.nbf()
 
         if 'core' in mode and max_deriv_order > 0:
             # A list of OEI derivative tensors, containing only unique elements
@@ -30,12 +33,13 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
             for i in range(max_deriv_order):
                 n_unique_derivs = how_many_derivs(natoms, i + 1)
                 oei_deriv = libint_interface.oei_deriv_core(i + 1)
-                self.overlap_derivatives.append(oei_deriv[0].reshape(n_unique_derivs,nbf,nbf))
-                self.kinetic_derivatives.append(oei_deriv[1].reshape(n_unique_derivs,nbf,nbf))
-                self.potential_derivatives.append(oei_deriv[2].reshape(n_unique_derivs,nbf,nbf))
+                self.overlap_derivatives.append(oei_deriv[0].reshape(n_unique_derivs, nbf1, nbf2))
+                self.kinetic_derivatives.append(oei_deriv[1].reshape(n_unique_derivs, nbf1, nbf2))
+                self.potential_derivatives.append(oei_deriv[2].reshape(n_unique_derivs, nbf1, nbf2))
 
         self.mode = mode
-        self.nbf = nbf
+        self.nbf1 = nbf1
+        self.nbf2 = nbf2
 
         # Create new JAX primitives for overlap, kinetic, potential evaluation and their derivatives
         self.overlap_p = jax.core.Primitive("overlap")
@@ -88,17 +92,17 @@ def potential_deriv(self, geom, deriv_vec):
     # Create primitive evaluation rules
     def overlap_impl(self, geom):
         S = libint_interface.overlap()
-        S = S.reshape(self.nbf,self.nbf)
+        S = S.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(S)
 
     def kinetic_impl(self, geom):
         T = libint_interface.kinetic()
-        T = T.reshape(self.nbf,self.nbf)
+        T = T.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(T)
 
     def potential_impl(self, geom):
         V = libint_interface.potential()
-        V = V.reshape(self.nbf,self.nbf)
+        V = V.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(V)
 
     def overlap_deriv_impl(self, geom, deriv_vec):
@@ -109,7 +113,7 @@ def overlap_deriv_impl(self, geom, deriv_vec):
         if 'core' in self.mode:
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(S)
-        else:
+        elif 'disk' in self.mode:
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "overlap_deriv" + str(deriv_order)
@@ -136,7 +140,7 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
         if 'core' in self.mode:
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(T)
-        else:
+        elif 'disk' in self.mode:
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "kinetic_deriv" + str(deriv_order)
@@ -163,7 +167,7 @@ def potential_deriv_impl(self, geom, deriv_vec):
         if 'core' in self.mode:
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(V)
-        else:
+        elif 'disk' in self.mode:
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "potential_deriv" + str(deriv_order)
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 29f3eed..b1cbfce 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -15,9 +15,16 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0) # Not generalized yet
         natoms = molecule.natom()
-        nbf = basis_set.nbf()
+
+        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
+        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
+        bs3 = psi4.core.BasisSet.build(molecule, 'BASIS', basis3, puream=0)
+        bs4 = psi4.core.BasisSet.build(molecule, 'BASIS', basis4, puream=0)
+        nbf1 = bs1.nbf()
+        nbf2 = bs2.nbf()
+        nbf3 = bs3.nbf()
+        nbf4 = bs4.nbf()
 
         if 'core' in mode and max_deriv_order > 0:
             # A list of ERI derivative tensors, containing only unique elements
@@ -27,11 +34,34 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
             self.eri_derivatives = []
             for i in range(max_deriv_order):
                 n_unique_derivs = how_many_derivs(natoms, i + 1)
-                eri_deriv = libint_interface.eri_deriv_core(i + 1).reshape(n_unique_derivs,nbf,nbf,nbf,nbf)
+                eri_deriv = libint_interface.eri_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
                 self.eri_derivatives.append(eri_deriv)
 
+        if 'f12' in mode and max_deriv_order > 0:
+            # Lists of F12-type integral derivative tensors (f12, f12 squared, f12g12, f12 double commutator), containing only unique elements
+            # corresponding to upper hypertriangle (since derivative tensors are symmetric)
+            # Length of each list is maximum deriv order, each array is (upper triangle derivatives,nbf1,nbf2,nbf3,nbf4)
+            # Then when JAX calls JVP, read appropriate slice
+            self.f12_derivatives = []
+            self.f12_squared_derivatives = []
+            self.f12g12_derivatives = []
+            self.f12_double_commutator_derivatives = []
+            for i in range(max_deriv_order):
+                n_unique_derivs = how_many_derivs(natoms, i + 1)
+                f12_deriv = libint_interface.f12_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
+                f12_squared_deriv = libint_interface.f12_squared_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
+                f12g12_deriv = libint_interface.f12g12_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
+                f12_double_commutator_deriv = libint_interface.f12_double_commutator_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
+                self.f12_derivatives.append(f12_deriv)
+                self.f12_squared_derivatives.append(f12_squared_deriv)
+                self.f12g12_derivatives.append(f12g12_deriv)
+                self.f12_double_commutator_derivatives.append(f12_double_commutator_deriv)
+
         self.mode = mode
-        self.nbf = nbf
+        self.nbf1 = nbf1
+        self.nbf2 = nbf2
+        self.nbf3 = nbf3
+        self.nbf4 = nbf4
 
         # Create new JAX primitive for TEI evaluation
         self.eri_p = jax.core.Primitive("eri")
@@ -111,31 +141,31 @@ def f12_double_commutator_deriv(self, geom, beta, deriv_vec):
     def eri_impl(self, geom):
         G = libint_interface.eri()
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        G = G.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        G = G.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(G)
 
     def f12_impl(self, geom, beta):
         F = libint_interface.f12(beta)
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12_squared_impl(self, geom, beta):
         F = libint_interface.f12_squared(beta)
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12g12_impl(self, geom, beta):
         F = libint_interface.f12g12(beta)
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
     
     def f12_double_commutator_impl(self, geom, beta):
         F = libint_interface.f12_double_commutator(beta)
         #d = int(np.sqrt(np.sqrt(G.shape[0])))
-        F = F.reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+        F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def eri_deriv_impl(self, geom, deriv_vec):
@@ -149,7 +179,7 @@ def eri_deriv_impl(self, geom, deriv_vec):
             return jnp.asarray(G)
 
         # Read from disk
-        elif self.mode == 'disk':
+        elif 'disk' in self.mode:
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("eri_derivs.h5"):
                 file_name = "eri_derivs.h5"
@@ -174,42 +204,134 @@ def eri_deriv_impl(self, geom, deriv_vec):
     def f12_deriv_impl(self, geom, beta, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
-        #idx = get_deriv_vec_idx(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        # Use eri derivatives in memory
+        # Use f12 derivatives in memory
         if 'core' in self.mode:
-            F = libint_interface.f12_deriv(beta, deriv_vec)
-            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+            F = self.f12_derivatives[deriv_order-1][idx,:,:,:,:]
+            return jnp.asarray(F)
+
+        # Read from disk
+        elif 'disk' in self.mode:
+            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
+            if os.path.exists("f12_derivs.h5"):
+                file_name = "f12_derivs.h5"
+                dataset_name = "f12_deriv" + str(deriv_order)
+            # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
+            elif os.path.exists("f12_partials.h5"):
+                file_name = "f12_partials.h5"
+                dataset_name = "f12_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("F12 derivatives not found on disk")
+
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 5:
+                    F = data_set[:,:,:,:,idx]
+                elif len(data_set.shape) == 4:
+                    F = data_set[:,:,:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(F)
 
     def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
-        #idx = get_deriv_vec_idx(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        # Use eri derivatives in memory
+        # Use f12 squared derivatives in memory
         if 'core' in self.mode:
-            F = libint_interface.f12_squared_deriv(beta, deriv_vec)
-            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+            F = self.f12_squared_derivatives[deriv_order-1][idx,:,:,:,:]
+            return jnp.asarray(F)
+
+        # Read from disk
+        elif 'disk' in self.mode:
+            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
+            if os.path.exists("f12_squared_derivs.h5"):
+                file_name = "f12_squared_derivs.h5"
+                dataset_name = "f12_squared_deriv" + str(deriv_order)
+            # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
+            elif os.path.exists("f12_squared_partials.h5"):
+                file_name = "f12_squared_partials.h5"
+                dataset_name = "f12_squared_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("F12 Squared derivatives not found on disk")
+
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 5:
+                    F = data_set[:,:,:,:,idx]
+                elif len(data_set.shape) == 4:
+                    F = data_set[:,:,:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(F)
 
     def f12g12_deriv_impl(self, geom, beta, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
-        #idx = get_deriv_vec_idx(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        # Use eri derivatives in memory
+        # Use f12g12 derivatives in memory
         if 'core' in self.mode:
-            F = libint_interface.f12g12_deriv(beta, deriv_vec)
-            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+            F = self.f12g12_derivatives[deriv_order-1][idx,:,:,:,:]
+            return jnp.asarray(F)
+
+        # Read from disk
+        elif 'disk' in self.mode:
+            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
+            if os.path.exists("f12g12_derivs.h5"):
+                file_name = "f12g12_derivs.h5"
+                dataset_name = "f12g12_deriv" + str(deriv_order)
+            # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
+            elif os.path.exists("f12g12_partials.h5"):
+                file_name = "f12g12_partials.h5"
+                dataset_name = "f12g12_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("F12G12 derivatives not found on disk")
+
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 5:
+                    F = data_set[:,:,:,:,idx]
+                elif len(data_set.shape) == 4:
+                    F = data_set[:,:,:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(F)
 
     def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
-        #idx = get_deriv_vec_idx(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
 
-        # Use eri derivatives in memory
+        # Use f12 double commutator derivatives in memory
         if 'core' in self.mode:
-            F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
-            return jnp.asarray(F).reshape(self.nbf,self.nbf,self.nbf,self.nbf)
+            F = self.f12_double_commutator_derivatives[deriv_order-1][idx,:,:,:,:]
+            return jnp.asarray(F)
+
+        # Read from disk
+        elif 'disk' in self.mode:
+            # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
+            if os.path.exists("f12_double_commutator_derivs.h5"):
+                file_name = "f12_double_commutator_derivs.h5"
+                dataset_name = "f12_double_commutator_deriv" + str(deriv_order)
+            # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
+            elif os.path.exists("f12_double_commutator_partials.h5"):
+                file_name = "f12_double_commutator_partials.h5"
+                dataset_name = "f12_double_commutator_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("F12 Double Commutator derivatives not found on disk")
+
+            with h5py.File(file_name, 'r') as f:
+                data_set = f[dataset_name]
+                if len(data_set.shape) == 5:
+                    F = data_set[:,:,:,:,idx]
+                elif len(data_set.shape) == 4:
+                    F = data_set[:,:,:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.asarray(F)
 
     # Create Jacobian-vector product rule, which given some input args (primals)
     # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index d4a85ea..0ff8151 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -22,7 +22,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_disk(geom,basis_name,xyz_path,deriv_order)
+        check = check_disk(geom, basis_name, xyz_path, deriv_order)
 
         tei_obj = TEI(basis_name, basis_name, basis_name, basis_name, xyz_path, deriv_order, 'disk')
         oei_obj = OEI(basis_name, basis_name, xyz_path, deriv_order, 'disk')
@@ -53,7 +53,7 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
     libint_interface.finalize()
     return S, T, V, G
 
-def check_disk(geom,basis_name,xyz_path,deriv_order,address=None):
+def check_disk(geom, basis_name, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
     # First check TEI's, then OEI's, return separately, check separately in compute_integrals
     correct_int_derivs = False
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index a2a1f4b..02e4574 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -7,7 +7,7 @@
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
 
-def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):
+def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
     E_scf, C, eps, G = restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
@@ -31,13 +31,15 @@ def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options,
     # Create all combinations of four loop variables to make XLA compilation easier
     indices = cartesian_product(jnp.arange(ndocc), jnp.arange(ndocc), jnp.arange(nvirt), jnp.arange(nvirt))
 
-    mp2_correlation = 0.0
     def loop_mp2(idx, mp2_corr):
         i,j,a,b = indices[idx]
         mp2_corr += G[i, a, j, b] * (2 * G[i, a, j, b] - G[i, b, j, a]) * e_denom[i, a, j, b]
         return mp2_corr
 
-    dE_mp2 = fori_loop(0, indices.shape[0], loop_mp2, mp2_correlation)
+    dE_mp2 = fori_loop(0, indices.shape[0], loop_mp2, 0.0) # MP2 correlation
 
-    return E_scf + dE_mp2
+    if return_aux_data:
+        return E_scf + dE_mp2, C, eps
+    else:
+        return E_scf + dE_mp2
 

From fe3ac4ff1f5c3649c34f5b040f290478604f482c Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 2 Oct 2023 14:38:46 -0400
Subject: [PATCH 14/91] F12 ints machinery, first steps to mp2-f12

---
 quax/core.py                       |   2 +-
 quax/integrals/basis_utils.py      | 102 +++++++-------------
 quax/integrals/libint_interface.cc |   7 +-
 quax/methods/energy_utils.py       |   8 +-
 quax/methods/hartree_fock.py       |   4 +-
 quax/methods/ints.py               | 146 ++++++++++++++++++++++++++---
 quax/methods/mp2-f12.py            |  57 +++++++++++
 7 files changed, 238 insertions(+), 88 deletions(-)
 create mode 100644 quax/methods/mp2-f12.py

diff --git a/quax/core.py b/quax/core.py
index 9f663f0..1d2caf3 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -8,7 +8,6 @@
 import os
 import h5py
 
-from .integrals.basis_utils import build_basis_set
 from .methods.energy_utils import nuclear_repulsion, cholesky_orthogonalization
 from .methods.hartree_fock import restricted_hartree_fock
 from .methods.mp2 import restricted_mp2
@@ -38,6 +37,7 @@ def check_options(options):
                        'damp_factor': 0.5,
                        'spectral_shift': True,
                        'integral_algo': 'libint_core',
+                       'beta': 1.0
                       }
 
     for key in options.keys():
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 8d039ae..b891287 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -2,75 +2,43 @@
 import jax.numpy as jnp
 import numpy as np
 
-def build_basis_set(molecule, basis_name):
-    # Avoids printing from psi4
-    psi4.core.be_quiet()
-    # Create empty dictionary to hold basis information
-    basis_dict = {}
-    # Build basis in Psi4
-    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
-    # Get total number of shells for the molecule
-    nshell = basis_set.nshell()
-    # Loop over each shell
-    for i in range(nshell):
-        # Create subdictionary for shell i that contains angular momentum
-        # and coefficient/exponent information for each primitive
-        basis_dict[i] = {}
-        basis_dict[i]['am'] = basis_set.shell(i).am
-        basis_dict[i]['atom'] = basis_set.shell_to_center(i)
-        basis_dict[i]['exp'] = []
-        basis_dict[i]['coef'] = []
-        basis_dict[i]['idx'] = basis_set.shell(i).function_index
-        basis_dict[i]['idx_stride'] = int(0.5 * (basis_set.shell(i).am + 1) * ((basis_set.shell(i).am + 1) + 1))
-        # Get total number of primitives for shell i
-        nprim = basis_set.shell(i).nprimitive
-        # Loop over each primitive in shell i
-        for j in range(nprim):
-            # Save the exponent and normalized coefficient of each primitive
-            basis_dict[i]['exp'].append(basis_set.shell(i).exp(j))
-            basis_dict[i]['coef'].append(basis_set.shell(i).coef(j))
-    return basis_dict
-
-def get_nbf(basis):
-    nshells = len(basis)
-    nbf = 0
-    for i in range(nshells):
-        nbf += basis[i]['idx_stride']
-    return nbf
-
-def flatten_basis_data(basis):
+def build_CABS(molecule, basis_name, cabs_name):
     """
-    Takes in a dictionary of basis set info and flattens 
-    all primitive data into vectors.
+    Builds and returns CABS
+    Provide molecule from Psi4,
+    OBS name, CABS name, and
+    MO coefficients from RHF
     """
-    nshells = len(basis)
-    coeffs = []
-    exps = []
-    atoms = []
-    ams = []
-    indices = []
-    dims = []
-    # Smush primitive data together into vectors
-    nbf = 0
-    for i in range(nshells):
-        tmp_coeffs = basis[i]['coef']  
-        tmp_exps = basis[i]['exp']  
-        nbf += basis[i]['idx_stride']
-        for j in tmp_coeffs:
-            coeffs.append(j)
-            atoms.append(basis[i]['atom'])
-            ams.append(basis[i]['am'])
-            indices.append(basis[i]['idx'])
-            dims.append(basis[i]['idx_stride'])
-        for j in tmp_exps:
-            exps.append(j)
-    coeffs = jnp.array(np.asarray(coeffs))
-    exps = jnp.array(np.asarray(exps))
-    atoms = jnp.array(np.asarray(atoms))
-    ams = jnp.array(np.asarray(ams))
-    indices = jnp.array(np.asarray(indices))
-    dims = jnp.array(np.asarray(dims))
-    return coeffs, exps, atoms, ams, indices, dims
+    cabs_name = cabs_name.lower().replace('cabs', 'optri')
+
+    keys = ["BASIS","CABS_BASIS"]
+    targets = [basis_name, cabs_name]
+    roles = ["ORBITAL","F12"]
+    others = [basis_name, basis_name]
+
+    # Creates combined basis set in Python
+    obs = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+    ao_union = psi4.driver.qcdb.libmintsbasisset.BasisSet.pyconstruct_combined(molecule.save_string_xyz(), keys, targets, roles, others)
+    ao_union = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
+    ri_space = psi4.core.OrbitalSpace.build_ri_space(ao_union, 1.0e-8)
+
+    C_ribs = np.array(ri_space.C()) # Orthogonalizes the AOs of the RI space
+
+    # Compute the overlap matrix between OBS and RIBS, then orthogonalizes the RIBS
+    mints = psi4.core.MintsHelper(obs)
+    S_ao_obs_ribs = np.array(mints.ao_overlap(obs, ri_space.basisset()))
+    C12 = np.einsum('Pq,qQ->PQ', S_ao_obs_ribs, C_ribs)
+
+    # Compute the eigenvectors and eigenvalues of S12.T * S12
+    _, S, Vt = np.linalg.svd(C12)
 
+    # Collect the eigenvectors that are associated with (near) zero eigenvalues
+    ncabs = S.shape[0]
+    for eval_i in S:
+        if abs(eval_i) < 1.0e-6: ncabs += 1
+    V_N = Vt[ncabs:, :].T
 
+    # Make sure the CABS is an orthonormal set
+    C_cabs = np.einsum('pQ,QP->pP', C_ribs, V_N)
 
+    return C_cabs
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index c7fb89a..b5850c2 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -25,7 +25,7 @@ libint2::BasisSet bs1, bs2, bs3, bs4;
 unsigned int nbf1, nbf2, nbf3, nbf4;
 std::vector<size_t> shell2bf_1, shell2bf_2, shell2bf_3, shell2bf_4;
 std::vector<long> shell2atom_1, shell2atom_2, shell2atom_3, shell2atom_4;
-int nthreads;
+int nthreads = 1;
 
 // These lookup arrays are for mapping Libint's computed shell-set integrals and integral derivatives to the proper index 
 // in the full OEI/TEI array or derivative array.
@@ -86,11 +86,12 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
     shell2atom_4 = bs4.shell2atom(atoms);
 
     // Get number of OMP threads
-    nthreads = 1;
 #ifdef _OPENMP
     nthreads = omp_get_max_threads();
 #endif
-    py::print("Number of OMP Threads:", nthreads);
+    if (basis1 == basis2 && basis3 == basis4 && basis2 == basis4) {
+        py::print("Number of OMP Threads:", nthreads);
+    }
 }
 
 void finalize() {
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 5881b91..0d0e9dc 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -54,10 +54,10 @@ def tei_transformation(G, C):
     New algo for TEI transform
     It's faster than psi4.MintsHelper.mo_transform() for basis sets <~120.
     """
-    G = transform(C,G)
-    G = transform(C,G)
-    G = transform(C,G)
-    G = transform(C,G)
+    G = transform(C, G)
+    G = transform(C, G)
+    G = transform(C, G)
+    G = transform(C, G)
     return G
 
 def partial_tei_transformation(G, Ci, Cj, Ck, Cl):
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 75126e2..8691aad 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -24,7 +24,7 @@ def restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge,
     else: 
         jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None))
 
-    S, T, V, G = compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv_order, options)
+    S, T, V, G = compute_integrals(geom, basis_name, xyz_path, deriv_order, options)
     # Canonical orthogonalization via cholesky decomposition
     A = cholesky_orthogonalization(S)
 
@@ -41,7 +41,7 @@ def restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge,
         shift = jnp.zeros_like(S)
 
     H = T + V
-    Enuc = nuclear_repulsion(geom.reshape(-1,3),nuclear_charges)
+    Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
     D = jnp.zeros_like(H)
     
     def rhf_iter(F,D):
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 0ff8151..f9bdd60 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -15,7 +15,7 @@
 from ..integrals import libint_interface
      
 
-def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv_order, options):
+def compute_integrals(geom, basis_name, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk 
     algo = options['integral_algo']
     libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
@@ -53,6 +53,96 @@ def compute_integrals(geom, basis_name, xyz_path, nuclear_charges, charge, deriv
     libint_interface.finalize()
     return S, T, V, G
 
+def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
+    # Load integral algo, decides to compute integrals in memory or use disk
+    algo = options['integral_algo']
+    libint_interface.initialize(xyz_path, basis1, basis2, basis1, basis2)
+
+    if algo == 'libint_disk':
+        # Check disk for currently existing integral derivatives
+        check = check_disk(geom, basis1, xyz_path, deriv_order)
+
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
+        # If disk integral derivs are right, nothing to do
+        if check:
+            T = oei_obj.kinetic(geom)
+            V = oei_obj.potential(geom)
+        else:
+            libint_interface.oei_deriv_disk(deriv_order)
+            T = oei_obj.kinetic(geom)
+            V = oei_obj.potential(geom)
+
+    else:
+        # Precompute TEI derivatives
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+        # Compute integrals
+        T = oei_obj.kinetic(geom)
+        V = oei_obj.potential(geom)
+
+    libint_interface.finalize()
+    return T + V
+
+def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, options):
+    # Load integral algo, decides to compute integrals in memory or use disk
+    algo = options['integral_algo']
+    beta = options['beta']
+    libint_interface.initialize(xyz_path, basis1, basis2, basis3, basis4)
+
+    if algo == 'libint_disk':
+        # Check disk for currently existing integral derivatives
+        check = check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
+
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'disk')
+        # If disk integral derivs are right, nothing to do
+        if check:
+            match int_type:
+                case "f12":
+                    F = tei_obj.f12(geom, beta)
+                case "f12_squared":
+                    F = tei_obj.f12_squared(geom, beta)
+                case "f12g12":
+                    F = tei_obj.f12g12(geom, beta)
+                case "f12_double_commutator":
+                    F = tei_obj.f12_double_commutator(geom, beta)
+                case "eri":
+                    F = tei_obj.eri(geom, beta)
+        else:
+            match int_type:
+                case "f12":
+                    libint_interface.f12_deriv_disk(deriv_order)
+                    F = tei_obj.f12(geom, beta)
+                case "f12_squared":
+                    libint_interface.f12_squared_deriv_disk(deriv_order)
+                    F = tei_obj.f12_squared(geom, beta)
+                case "f12g12":
+                    libint_interface.f12g12_deriv_disk(deriv_order)
+                    F = tei_obj.f12g12(geom, beta)
+                case "f12_double_commutator":
+                    libint_interface.f12_double_commutator_deriv_disk(deriv_order)
+                    F = tei_obj.f12_double_commutator(geom, beta)
+                case "eri":
+                    libint_interface.eri_deriv_disk(deriv_order)
+                    F = tei_obj.eri(geom, beta)
+
+    else:
+        # Precompute TEI derivatives
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'core')
+        # Compute integrals
+        match int_type:
+            case "f12":
+                F = tei_obj.f12(geom, beta)
+            case "f12_squared":
+                F = tei_obj.f12_squared(geom, beta)
+            case "f12g12":
+                F = tei_obj.f12g12(geom, beta)
+            case "f12_double_commutator":
+                F = tei_obj.f12_double_commutator(geom, beta)
+            case "eri":
+                F = tei_obj.eri(geom, beta)
+
+    libint_interface.finalize()
+    return F
+
 def check_disk(geom, basis_name, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
     # First check TEI's, then OEI's, return separately, check separately in compute_integrals
@@ -78,18 +168,52 @@ def check_disk(geom, basis_name, xyz_path, deriv_order, address=None):
         if correct_int_derivs:
             print("Integral derivatives appear to be correct. Avoiding recomputation.")
 
-    # TODO flesh out this logic for determining if partials file contains all integrals needed
-    # for particular address
-    elif ((os.path.exists("eri_partials.h5") and os.path.exists("oei_partials.h5"))):
-        print("Found currently existing partial derivatives in working directory. Assuming they are correct.") 
-        oeifile = h5py.File('oei_partials.h5', 'r')
-        erifile = h5py.File('eri_partials.h5', 'r')
+#    # TODO flesh out this logic for determining if partials file contains all integrals needed
+#    # for particular address
+#    elif ((os.path.exists("eri_partials.h5") and os.path.exists("oei_partials.h5"))):
+#        print("Found currently existing partial derivatives in working directory. Assuming they are correct.") 
+#        oeifile = h5py.File('oei_partials.h5', 'r')
+#        erifile = h5py.File('eri_partials.h5', 'r')
+#        with open(xyz_path, 'r') as f:
+#            tmp = f.read()
+#        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+#        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+#        nbf = basis_set.nbf()
+#        sample_dataset_name = list(oeifile.keys())[0]
+#        correct_nbf = oeifile[sample_dataset_name].shape[0] == nbf
+#        correct_int_derivs = correct_nbf
+#    return correct_int_derivs
+
+def check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, address=None):
+    # TODO need to check geometry and basis set name in addition to nbf
+    # First check TEI's, then OEI's, return separately, check separately in compute_integrals
+    correct_int_derivs = False
+
+    if ((os.path.exists(int_type + "_derivs.h5"))):
+        print("Found currently existing integral derivatives in your working directory. Trying to use them.")
+        erifile = h5py.File(int_type + '_derivs.h5', 'r')
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
-        nbf = basis_set.nbf()
+        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
+        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
+        bs3 = psi4.core.BasisSet.build(molecule, 'BASIS', basis3, puream=0)
+        bs4 = psi4.core.BasisSet.build(molecule, 'BASIS', basis4, puream=0)
+        nbf1 = bs1.nbf()
+        nbf2 = bs2.nbf()
+        nbf3 = bs3.nbf()
+        nbf4 = bs4.nbf()
+        # Check if there are `deriv_order` datasets in the eri file
+        correct_deriv_order = len(erifile) == deriv_order
+        # Check nbf dimension of integral arrays
         sample_dataset_name = list(oeifile.keys())[0]
-        correct_nbf = oeifile[sample_dataset_name].shape[0] == nbf
-        correct_int_derivs = correct_nbf
+        correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
+        correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
+        correct_nbf3 = oeifile[sample_dataset_name].shape[2] == nbf3
+        correct_nbf4 = oeifile[sample_dataset_name].shape[3] == nbf4
+        erifile.close()
+        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 and correct_nbf3 and correct_nbf4
+        if correct_int_derivs:
+            print("Integral derivatives appear to be correct. Avoiding recomputation.")
+
     return correct_int_derivs
diff --git a/quax/methods/mp2-f12.py b/quax/methods/mp2-f12.py
new file mode 100644
index 0000000..153de7c
--- /dev/null
+++ b/quax/methods/mp2-f12.py
@@ -0,0 +1,57 @@
+import jax 
+from jax.config import config; config.update("jax_enable_x64", True)
+import jax.numpy as jnp
+from jax.lax import fori_loop
+import psi4
+
+from ..integrals.basis_utils import build_CABS
+from .ints import compute_f12_oeints, compute_f12_teints
+from .energy_utils import nuclear_repulsion, tei_transformation
+from .mp2 import restricted_mp2
+
+def restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    ndocc = nelectrons // 2
+    E_mp2, C_obs, eps = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+
+    # Force to use Dunning basis sets with associated CABS
+    # Libint has a limited number of basis sets available
+    if 'cc-pv' in basis_name.lower():
+        cabs_name = basis_name + "-cabs"
+    C_cabs = jnp.array(build_CABS(geom, basis_name, cabs_name))
+
+    h = form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options)
+
+    f, fk = form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, ndocc, xyz_path, deriv_order, options)
+
+def form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options):
+    nobs = C_obs.shape[0]
+    nri = C_cabs.shape[0]
+
+    h = jnp.zeros((nri, nri))
+
+    h_tmp = compute_f12_oeints(geom, basis_name, basis_name, xyz_path, deriv_order, options)
+    h[:nobs, :nobs] = jnp.dot(C_obs, jnp.dot(h_tmp, C_obs))
+
+    h_tmp = compute_f12_oeints(geom, basis_name, cabs_name, xyz_path, deriv_order, options)
+    h[:nobs, nobs:nri] = jnp.dot(C_obs, jnp.dot(h_tmp, C_cabs))
+    h[nobs:nri, :nobs] = h[:nobs, nobs:nri].T
+
+    h_tmp = compute_f12_oeints(geom, cabs_name, cabs_name, xyz_path, deriv_order, options)
+    h[nobs:nri, nobs:nri] = jnp.dot(C_cabs, jnp.dot(h_tmp, C_cabs))
+
+    return h
+
+def form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, nocc, xyz_path, deriv_order, options):
+    nobs = C_obs.shape[0]
+    nri = C_cabs.shape[0]
+
+    f = np.zeros((nri, nri))
+    fk = np.zeros((nri, nri))
+
+    J_tmp = compute_f12_teints(geom, basis_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
+
+
+
+
+    

From 82195204e964750426ff47fd056ca5d547c51cbe Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 5 Oct 2023 10:22:17 -0400
Subject: [PATCH 15/91] WIP: F12 basis set issues

---
 quax/core.py                  | 18 ++++++-
 quax/integrals/basis_utils.py |  9 ++--
 quax/integrals/oei.py         | 13 +++++
 quax/integrals/tei.py         | 20 ++++++++
 quax/methods/__init__.py      |  3 +-
 quax/methods/ints.py          |  8 ++--
 quax/methods/mp2-f12.py       | 57 ----------------------
 quax/methods/mp2f12.py        | 90 +++++++++++++++++++++++++++++++++++
 8 files changed, 151 insertions(+), 67 deletions(-)
 delete mode 100644 quax/methods/mp2-f12.py
 create mode 100644 quax/methods/mp2f12.py

diff --git a/quax/core.py b/quax/core.py
index 1d2caf3..cd5472c 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -8,9 +8,11 @@
 import os
 import h5py
 
+from .integrals.basis_utils import build_CABS
 from .methods.energy_utils import nuclear_repulsion, cholesky_orthogonalization
 from .methods.hartree_fock import restricted_hartree_fock
 from .methods.mp2 import restricted_mp2
+from .methods.mp2f12 import restricted_mp2_f12
 from .methods.ccsd import rccsd
 from .methods.ccsd_t import rccsd_t
 from .utils import get_required_deriv_vecs
@@ -62,7 +64,6 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
         options = check_options({})
     print("Using integral method: {}".format(options['integral_algo']))
 
-
     # Load molecule data
     geom2d = np.asarray(molecule.geometry())
     geom_list = geom2d.reshape(-1).tolist() 
@@ -81,6 +82,13 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     natoms = molecule.natom()
     print("Number of basis functions: ", nbf)
 
+    if method == 'mp2-f12': # Ensure use of Dunning basis sets
+        try:
+            cabs_name = basis_name + "-cabs"
+            cabs_space = build_CABS(molecule, basis_name, cabs_name)
+        except:
+            raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
+
     # Energy and full derivative tensor evaluations
     if not partial:
         # Create energy evaluation function
@@ -90,6 +98,9 @@ def electronic_energy(*args, deriv_order=deriv_order):
         elif method =='mp2':
             def electronic_energy(*args, deriv_order=deriv_order):
                 return restricted_mp2(*args, deriv_order=deriv_order)
+        elif method =='mp2-f12':
+            def electronic_energy(*args, deriv_order=deriv_order):
+                return restricted_mp2_f12(*args, cabs_space, deriv_order=deriv_order)
         elif method =='ccsd':
             def electronic_energy(*args, deriv_order=deriv_order):
                 return rccsd(*args, deriv_order=deriv_order)
@@ -142,6 +153,11 @@ def partial_wrapper(*args):
                 E_scf = restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
+            def partial_wrapper(*args):
+                geom = jnp.asarray(args)
+                E_mp2f12 = restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                return E_mp2f12
+        elif method =='mp2-f12':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
                 E_mp2 = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index b891287..40e918c 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -9,10 +9,11 @@ def build_CABS(molecule, basis_name, cabs_name):
     OBS name, CABS name, and
     MO coefficients from RHF
     """
-    cabs_name = cabs_name.lower().replace('cabs', 'optri')
+    # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
+    psi4_name = cabs_name.lower().replace('cabs', 'optri')
 
     keys = ["BASIS","CABS_BASIS"]
-    targets = [basis_name, cabs_name]
+    targets = [basis_name, psi4_name]
     roles = ["ORBITAL","F12"]
     others = [basis_name, basis_name]
 
@@ -39,6 +40,6 @@ def build_CABS(molecule, basis_name, cabs_name):
     V_N = Vt[ncabs:, :].T
 
     # Make sure the CABS is an orthonormal set
-    C_cabs = np.einsum('pQ,QP->pP', C_ribs, V_N)
+    C_cabs = psi4.core.Matrix.from_array(np.einsum('pQ,QP->pP', C_ribs, V_N))
 
-    return C_cabs
+    return psi4.core.OrbitalSpace(ri_space.id(), cabs_name, C_cabs, ri_space.basisset(), ri_space.integral())
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 68c8d3a..8e8e341 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -17,11 +17,23 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         natoms = molecule.natom()
 
+        # Libint and Psi4 CABS naming
+        if 'cabs' in basis1.lower():
+            basis1 = basis1.lower().replace('cabs', 'optri')
+        if 'cabs' in basis2.lower():
+            basis2 = basis2.lower().replace('cabs', 'optri')
+
         bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
         bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
         nbf1 = bs1.nbf()
         nbf2 = bs2.nbf()
 
+        if 'f12' in mode:
+            if 'optri' in basis1:
+                nbf1 += bs2.nbf()
+            if 'optri' in basis2:
+                nbf2 += bs1.nbf()
+
         if 'core' in mode and max_deriv_order > 0:
             # A list of OEI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
@@ -37,6 +49,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
                 self.kinetic_derivatives.append(oei_deriv[1].reshape(n_unique_derivs, nbf1, nbf2))
                 self.potential_derivatives.append(oei_deriv[2].reshape(n_unique_derivs, nbf1, nbf2))
 
+
         self.mode = mode
         self.nbf1 = nbf1
         self.nbf2 = nbf2
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index b1cbfce..0435600 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -17,6 +17,16 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         natoms = molecule.natom()
 
+        # Libint and Psi4 CABS naming
+        if 'cabs' in basis1.lower():
+            basis1 = basis1.lower().replace('cabs', 'optri')
+        if 'cabs' in basis2.lower():
+            basis2 = basis2.lower().replace('cabs', 'optri')
+        if 'cabs' in basis3.lower():
+            basis3 = basis3.lower().replace('cabs', 'optri')
+        if 'cabs' in basis4.lower():
+            basis4 = basis4.lower().replace('cabs', 'optri')
+
         bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
         bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
         bs3 = psi4.core.BasisSet.build(molecule, 'BASIS', basis3, puream=0)
@@ -26,6 +36,16 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
         nbf3 = bs3.nbf()
         nbf4 = bs4.nbf()
 
+        if 'f12' in mode:
+            if 'optri' in basis1:
+                nbf1 += bs2.nbf()
+            if 'optri' in basis2:
+                nbf2 += bs1.nbf()
+            if 'optri' in basis1:
+                nbf3 += bs2.nbf()
+            if 'optri' in basis2:
+                nbf4 += bs1.nbf()
+
         if 'core' in mode and max_deriv_order > 0:
             # A list of ERI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
diff --git a/quax/methods/__init__.py b/quax/methods/__init__.py
index bf11c90..bb9245f 100644
--- a/quax/methods/__init__.py
+++ b/quax/methods/__init__.py
@@ -1,6 +1,7 @@
 from . import energy_utils
 from . import hartree_fock 
-from . import mp2 
+from . import mp2
+from . import mp2f12
 from . import ccsd
 from . import ccsd_t
 from . import ints
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index f9bdd60..ce8aa4e 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -62,7 +62,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
         # Check disk for currently existing integral derivatives
         check = check_disk(geom, basis1, xyz_path, deriv_order)
 
-        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12_disk')
         # If disk integral derivs are right, nothing to do
         if check:
             T = oei_obj.kinetic(geom)
@@ -74,7 +74,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
 
     else:
         # Precompute TEI derivatives
-        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12_core')
         # Compute integrals
         T = oei_obj.kinetic(geom)
         V = oei_obj.potential(geom)
@@ -92,7 +92,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
         # Check disk for currently existing integral derivatives
         check = check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
 
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'disk')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'f12_disk')
         # If disk integral derivs are right, nothing to do
         if check:
             match int_type:
@@ -126,7 +126,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'core')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'f12_core')
         # Compute integrals
         match int_type:
             case "f12":
diff --git a/quax/methods/mp2-f12.py b/quax/methods/mp2-f12.py
deleted file mode 100644
index 153de7c..0000000
--- a/quax/methods/mp2-f12.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
-import jax.numpy as jnp
-from jax.lax import fori_loop
-import psi4
-
-from ..integrals.basis_utils import build_CABS
-from .ints import compute_f12_oeints, compute_f12_teints
-from .energy_utils import nuclear_repulsion, tei_transformation
-from .mp2 import restricted_mp2
-
-def restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
-    ndocc = nelectrons // 2
-    E_mp2, C_obs, eps = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
-
-    # Force to use Dunning basis sets with associated CABS
-    # Libint has a limited number of basis sets available
-    if 'cc-pv' in basis_name.lower():
-        cabs_name = basis_name + "-cabs"
-    C_cabs = jnp.array(build_CABS(geom, basis_name, cabs_name))
-
-    h = form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options)
-
-    f, fk = form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, ndocc, xyz_path, deriv_order, options)
-
-def form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options):
-    nobs = C_obs.shape[0]
-    nri = C_cabs.shape[0]
-
-    h = jnp.zeros((nri, nri))
-
-    h_tmp = compute_f12_oeints(geom, basis_name, basis_name, xyz_path, deriv_order, options)
-    h[:nobs, :nobs] = jnp.dot(C_obs, jnp.dot(h_tmp, C_obs))
-
-    h_tmp = compute_f12_oeints(geom, basis_name, cabs_name, xyz_path, deriv_order, options)
-    h[:nobs, nobs:nri] = jnp.dot(C_obs, jnp.dot(h_tmp, C_cabs))
-    h[nobs:nri, :nobs] = h[:nobs, nobs:nri].T
-
-    h_tmp = compute_f12_oeints(geom, cabs_name, cabs_name, xyz_path, deriv_order, options)
-    h[nobs:nri, nobs:nri] = jnp.dot(C_cabs, jnp.dot(h_tmp, C_cabs))
-
-    return h
-
-def form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, nocc, xyz_path, deriv_order, options):
-    nobs = C_obs.shape[0]
-    nri = C_cabs.shape[0]
-
-    f = np.zeros((nri, nri))
-    fk = np.zeros((nri, nri))
-
-    J_tmp = compute_f12_teints(geom, basis_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
-
-
-
-
-    
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
new file mode 100644
index 0000000..82a2833
--- /dev/null
+++ b/quax/methods/mp2f12.py
@@ -0,0 +1,90 @@
+import jax 
+from jax.config import config; config.update("jax_enable_x64", True)
+import jax.numpy as jnp
+from jax.lax import fori_loop
+import psi4
+
+from .ints import compute_f12_oeints, compute_f12_teints
+from .energy_utils import nuclear_repulsion, tei_transformation
+from .mp2 import restricted_mp2
+
+def restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
+    """Run restricted_mp2, build the (f, fk) pair with form_Fock, and return only f."""
+    # Doubly occupied count: (sum of nuclear charges - net charge) // 2
+    ndocc = (int(jnp.sum(nuclear_charges)) - charge) // 2
+    _, C_obs, _ = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    cabs_name = cabs_space.name()
+    C_cabs = jnp.array(cabs_space.C().to_array())
+    fock, _ = form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, ndocc, xyz_path, deriv_order, options)
+
+    return fock
+
+def form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options):
+    """Assemble the (nri, nri) one-electron matrix from its <O|O>, <O|C>/<C|O> and <C|C> blocks.
+
+    Each block comes from compute_f12_oeints for the matching pair of basis names,
+    is contracted with the corresponding coefficient matrices, and is written into h.
+    """
+    nobs, nri = C_obs.shape[0], C_cabs.shape[0]
+    obs, cabs = slice(None, nobs), slice(nobs, nri)
+
+    def transform(C_left, C_right, ints):
+        return jnp.einsum('pP,qQ,pq->PQ', C_left, C_right, ints, optimize='optimal')
+
+    # <O|O>
+    oo = transform(C_obs, C_obs, compute_f12_oeints(geom, basis_name, basis_name, xyz_path, deriv_order, options))
+    h = jnp.empty((nri, nri)).at[obs, obs].set(oo)
+    # <O|C>, with <C|O> filled from its transpose
+    oc = transform(C_obs, C_cabs, compute_f12_oeints(geom, basis_name, cabs_name, xyz_path, deriv_order, options))
+    h = h.at[obs, cabs].set(oc).at[cabs, obs].set(oc.T)
+    # <C|C>
+    cc = transform(C_cabs, C_cabs, compute_f12_oeints(geom, cabs_name, cabs_name, xyz_path, deriv_order, options))
+    h = h.at[cabs, cabs].set(cc)
+
+    return h
+
+def form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, nocc, xyz_path, deriv_order, options):
+    """Build the matrices (f, fk) from form_h and the transformed "eri" integral blocks in G.
+
+    Returns a tuple (f, fk) with fk = h + 2J and f = fk - K, where J and K are
+    the 'PIQI' and 'PIIQ' contractions of G restricted to the first nocc indices.
+    """
+    nobs = C_obs.shape[0]
+    nri = C_cabs.shape[0]
+
+    # OEINTS
+    h = form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options)
+
+    # TEINTS
+    # NOTE(review): block shapes/transposes below were not checked against the target slices -- confirm.
+    G = jnp.empty((nri, nobs, nri, nri))
+    G_tmp = compute_f12_teints(geom, basis_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_obs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
+    G = G.at[:nobs, :nocc, :nobs, :nobs].set(G_tmp) # <OO|OO>
+    G_tmp = compute_f12_teints(geom, cabs_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :nocc, :nobs, :nobs].set(G_tmp) # <CO|OO>
+    G = G.at[:nocc, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,3,1,0))) # <OO|CO>
+    G = G.at[:nocc, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
+    G_tmp = compute_f12_teints(geom, cabs_name, basis_name, basis_name, cabs_name, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_obs, C_obs, C_cabs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :nocc, :nobs, nobs:nri].set(G_tmp) # <CO|OC>
+    G_tmp = compute_f12_teints(geom, cabs_name, cabs_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_cabs, C_obs, C_obs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :nocc, nobs:nri, :nobs].set(G_tmp) # <CO|CO>
+
+    # Fill Fock Matrix. jnp arrays are immutable: .at[...].set/.add return new arrays,
+    # so every intermediate is bound to a name instead of being discarded.
+    J = jnp.einsum('PIQI->PQ', G[:, :nocc, :, :nocc], optimize='optimal')
+    K = jnp.einsum('PIIQ->PQ', G[:, :nocc, :nocc, :], optimize='optimal')
+    fk = h + 2.0 * J
+    f = fk - K
+
+    return f, fk
+
+    
+
+
+
+
+    

From b3c9ffe1bbe29a6debaa75413e24dd1a4d4a4f9f Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 6 Oct 2023 16:11:57 -0400
Subject: [PATCH 16/91] Form BS once, simplify max_nprim and max_l

---
 quax/core.py                       |  3 +-
 quax/integrals/libint_interface.cc | 70 +++---------------------------
 quax/integrals/oei.py              | 22 +++-------
 quax/integrals/tei.py              | 40 ++++++-----------
 quax/methods/ccsd.py               |  4 +-
 quax/methods/ccsd_t.py             |  4 +-
 quax/methods/hartree_fock.py       |  4 +-
 quax/methods/ints.py               | 38 ++++++++--------
 quax/methods/mp2.py                |  4 +-
 9 files changed, 55 insertions(+), 134 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index cd5472c..90a1f5b 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -63,6 +63,7 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     else:
         options = check_options({})
     print("Using integral method: {}".format(options['integral_algo']))
+    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
 
     # Load molecule data
     geom2d = np.asarray(molecule.geometry())
@@ -75,7 +76,6 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     mult = molecule.multiplicity()
     charge = molecule.molecular_charge()
     nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
-    args = (geom, basis_name, xyz_path, nuclear_charges, charge, options)
 
     basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     nbf = basis_set.nbf()
@@ -90,6 +90,7 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
             raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
 
     # Energy and full derivative tensor evaluations
+    args = (geom, basis_set, xyz_path, nuclear_charges, charge, options)
     if not partial:
         # Create energy evaluation function
         if method == 'scf' or method == 'hf' or method == 'rhf':
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index b5850c2..a53c5ea 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -25,6 +25,8 @@ libint2::BasisSet bs1, bs2, bs3, bs4;
 unsigned int nbf1, nbf2, nbf3, nbf4;
 std::vector<size_t> shell2bf_1, shell2bf_2, shell2bf_3, shell2bf_4;
 std::vector<long> shell2atom_1, shell2atom_2, shell2atom_3, shell2atom_4;
+size_t max_nprim;
+int max_l;
 int nthreads = 1;
 
 // These lookup arrays are for mapping Libint's computed shell-set integrals and integral derivatives to the proper index 
@@ -85,13 +87,13 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
     shell2atom_3 = bs3.shell2atom(atoms);
     shell2atom_4 = bs4.shell2atom(atoms);
 
+    max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
+    max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+
     // Get number of OMP threads
 #ifdef _OPENMP
     nthreads = omp_get_max_threads();
 #endif
-    if (basis1 == basis2 && basis3 == basis4 && basis2 == basis4) {
-        py::print("Number of OMP Threads:", nthreads);
-    }
 }
 
 void finalize() {
@@ -224,8 +226,6 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
 py::array overlap() {
     // Overlap integral engine
     std::vector<libint2::Engine> s_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
@@ -268,8 +268,6 @@ py::array overlap() {
 py::array kinetic() {
     // Kinetic energy integral engine
     std::vector<libint2::Engine> t_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         t_engines[i] = t_engines[0];
@@ -312,8 +310,6 @@ py::array kinetic() {
 py::array potential() {
     // Potential integral engine
     std::vector<libint2::Engine> v_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l);
     v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
@@ -358,8 +354,6 @@ py::array potential() {
 py::array eri() {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     std::vector<libint2::Engine> eri_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
@@ -418,8 +412,6 @@ py::array f12(double beta) {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
     cgtg_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -479,8 +471,6 @@ py::array f12_squared(double beta) {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     auto cgtg_params = take_square(make_cgtg(beta));
     std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
     cgtg_squared_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -540,8 +530,6 @@ py::array f12g12(double beta) {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
     cgtg_coulomb_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -601,8 +589,6 @@ py::array f12_double_commutator(double beta) {
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
     cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0, 0., cgtg_params, libint2::BraKet::xx_xx);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -670,8 +656,6 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
 
     // Overlap integral derivative engine
     std::vector<libint2::Engine> s_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
@@ -766,8 +750,6 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
 
     // Kinetic integral derivative engine
     std::vector<libint2::Engine> t_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         t_engines[i] = t_engines[0];
@@ -869,8 +851,6 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
 
     // Potential integral derivative engine
     std::vector<libint2::Engine> v_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
     v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
@@ -998,8 +978,6 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 
     // ERI derivative integral engine
     std::vector<libint2::Engine> eri_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
@@ -1151,8 +1129,6 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
     cgtg_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -1305,8 +1281,6 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
     // F12 Squared derivative integral engine
     auto cgtg_params = take_square(make_cgtg(beta));
     std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
     cgtg_squared_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -1459,8 +1433,6 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
     cgtg_coulomb_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -1613,8 +1585,6 @@ py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
     cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -1784,9 +1754,6 @@ void oei_deriv_disk(int max_deriv_order) {
         total_deriv_slices += how_many_derivs(natom, i);
     }
 
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
-
     // Create H5 File and prepare to fill with 0.0's
     const H5std_string file_name("oei_derivs.h5");
     H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
@@ -1993,9 +1960,6 @@ void eri_deriv_disk(int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -2163,9 +2127,6 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -2336,9 +2297,6 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -2511,9 +2469,6 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -2684,9 +2639,6 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
     DSetCreatPropList plist;
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -2860,8 +2812,6 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
     // Define engines and buffers
     std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-    size_t max_nprim = std::max(bs1.max_nprim(), bs2.max_nprim());
-    int max_l = std::max(bs1.max_l(), bs2.max_l());
     s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
     t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
     v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
@@ -3010,8 +2960,6 @@ py::array eri_deriv_core(int deriv_order) {
 
     // Libint engine for computing shell quartet derivatives
     std::vector<libint2::Engine> eri_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
@@ -3137,8 +3085,6 @@ py::array f12_deriv_core(double beta, int deriv_order) {
     // Libint engine for computing shell quartet derivatives
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
     cgtg_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -3265,8 +3211,6 @@ py::array f12_squared_deriv_core(double beta, int deriv_order) {
     // Libint engine for computing shell quartet derivatives
     auto cgtg_params = take_square(make_cgtg(beta));
     std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
     cgtg_squared_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -3393,8 +3337,6 @@ py::array f12g12_deriv_core(double beta, int deriv_order) {
     // Libint engine for computing shell quartet derivatives
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
     cgtg_coulomb_engines[0].set_params(cgtg_params);
     for (size_t i = 1; i != nthreads; ++i) {
@@ -3521,8 +3463,6 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
     // Libint engine for computing shell quartet derivatives
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
     // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
     cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
     for (size_t i = 1; i != nthreads; ++i) {
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 8e8e341..ec30695 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -18,21 +18,13 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         natoms = molecule.natom()
 
         # Libint and Psi4 CABS naming
-        if 'cabs' in basis1.lower():
-            basis1 = basis1.lower().replace('cabs', 'optri')
-        if 'cabs' in basis2.lower():
-            basis2 = basis2.lower().replace('cabs', 'optri')
-
-        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
-        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
-        nbf1 = bs1.nbf()
-        nbf2 = bs2.nbf()
-
-        if 'f12' in mode:
-            if 'optri' in basis1:
-                nbf1 += bs2.nbf()
-            if 'optri' in basis2:
-                nbf2 += bs1.nbf()
+        if 'cabs' in basis1.name().lower():
+            basis1_name = basis1.name().lower().replace('cabs', 'optri')
+        if 'cabs' in basis2.name().lower():
+            basis2_name = basis2.name().lower().replace('cabs', 'optri')
+
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
 
         if 'core' in mode and max_deriv_order > 0:
             # A list of OEI derivative tensors, containing only unique elements
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 0435600..7eed871 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -18,33 +18,19 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
         natoms = molecule.natom()
 
         # Libint and Psi4 CABS naming
-        if 'cabs' in basis1.lower():
-            basis1 = basis1.lower().replace('cabs', 'optri')
-        if 'cabs' in basis2.lower():
-            basis2 = basis2.lower().replace('cabs', 'optri')
-        if 'cabs' in basis3.lower():
-            basis3 = basis3.lower().replace('cabs', 'optri')
-        if 'cabs' in basis4.lower():
-            basis4 = basis4.lower().replace('cabs', 'optri')
-
-        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
-        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
-        bs3 = psi4.core.BasisSet.build(molecule, 'BASIS', basis3, puream=0)
-        bs4 = psi4.core.BasisSet.build(molecule, 'BASIS', basis4, puream=0)
-        nbf1 = bs1.nbf()
-        nbf2 = bs2.nbf()
-        nbf3 = bs3.nbf()
-        nbf4 = bs4.nbf()
-
-        if 'f12' in mode:
-            if 'optri' in basis1:
-                nbf1 += bs2.nbf()
-            if 'optri' in basis2:
-                nbf2 += bs1.nbf()
-            if 'optri' in basis1:
-                nbf3 += bs2.nbf()
-            if 'optri' in basis2:
-                nbf4 += bs1.nbf()
+        if 'cabs' in basis1.name().lower():
+            basis1_name = basis1.name().lower().replace('cabs', 'optri')
+        if 'cabs' in basis2.name().lower():
+            basis2_name = basis2.name().lower().replace('cabs', 'optri')
+        if 'cabs' in basis3.name().lower():
+            basis3_name = basis3.name().lower().replace('cabs', 'optri')
+        if 'cabs' in basis4.name().lower():
+            basis4_name = basis4.name().lower().replace('cabs', 'optri')
+
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+        nbf3 = basis3.nbf()
+        nbf4 = basis4.nbf()
 
         if 'core' in mode and max_deriv_order > 0:
             # A list of ERI derivative tensors, containing only unique elements
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 9242b13..421f5f8 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -6,9 +6,9 @@
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
 from .hartree_fock import restricted_hartree_fock
 
-def rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
+def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
     # Do HF
-    E_scf, C, eps, V = restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    E_scf, C, eps, V = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
 
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 4015ba9..83a0c12 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -94,8 +94,8 @@ def loop_k(arr2):
     i, j, k, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
     return pT
 
-def rccsd_t(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0):
-    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+def rccsd_t(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0):
+    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
     pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
     #print("(T) energy correction:     ", pT)
     #print("CCSD(T) total energy:      ", E_ccsd + pT)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 8691aad..d69d006 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -7,7 +7,7 @@
 from .ints import compute_integrals
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
-def restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
+def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
     # Load keyword options
     maxit = options['maxit']
     damping = options['damping']
@@ -24,7 +24,7 @@ def restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge,
     else: 
         jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None))
 
-    S, T, V, G = compute_integrals(geom, basis_name, xyz_path, deriv_order, options)
+    S, T, V, G = compute_integrals(geom, basis_set, xyz_path, deriv_order, options)
     # Canonical orthogonalization via cholesky decomposition
     A = cholesky_orthogonalization(S)
 
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index ce8aa4e..607cf26 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -15,17 +15,18 @@
 from ..integrals import libint_interface
      
 
-def compute_integrals(geom, basis_name, xyz_path, deriv_order, options):
+def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk 
     algo = options['integral_algo']
+    basis_name = basis_set.name()
     libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_disk(geom, basis_name, xyz_path, deriv_order)
+        check = check_disk(geom, basis_set, xyz_path, deriv_order)
 
-        tei_obj = TEI(basis_name, basis_name, basis_name, basis_name, xyz_path, deriv_order, 'disk')
-        oei_obj = OEI(basis_name, basis_name, xyz_path, deriv_order, 'disk')
+        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
         # If disk integral derivs are right, nothing to do
         if check:
             S = oei_obj.overlap(geom)
@@ -42,8 +43,8 @@ def compute_integrals(geom, basis_name, xyz_path, deriv_order, options):
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis_name, basis_name, basis_name, basis_name, xyz_path, deriv_order, 'core')
-        oei_obj = OEI(basis_name, basis_name, xyz_path, deriv_order, 'core')
+        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, 'core')
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'core')
         # Compute integrals
         S = oei_obj.overlap(geom)
         T = oei_obj.kinetic(geom)
@@ -56,7 +57,9 @@ def compute_integrals(geom, basis_name, xyz_path, deriv_order, options):
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
-    libint_interface.initialize(xyz_path, basis1, basis2, basis1, basis2)
+    basis1_name = basis1.name()
+    basis2_name = basis2.name()
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name)
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
@@ -86,7 +89,11 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
     beta = options['beta']
-    libint_interface.initialize(xyz_path, basis1, basis2, basis3, basis4)
+    basis1_name = basis1.name()
+    basis2_name = basis2.name()
+    basis3_name = basis3.name()
+    basis4_name = basis4.name()
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis3_name, basis4_name)
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
@@ -143,7 +150,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
     libint_interface.finalize()
     return F
 
-def check_disk(geom, basis_name, xyz_path, deriv_order, address=None):
+def check_disk(geom, basis_set, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
     # First check TEI's, then OEI's, return separately, check separately in compute_integrals
     correct_int_derivs = False
@@ -155,7 +162,6 @@ def check_disk(geom, basis_name, xyz_path, deriv_order, address=None):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
         nbf = basis_set.nbf()
         # Check if there are `deriv_order` datasets in the eri file
         correct_deriv_order = len(erifile) == deriv_order
@@ -195,14 +201,10 @@ def check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        bs1 = psi4.core.BasisSet.build(molecule, 'BASIS', basis1, puream=0)
-        bs2 = psi4.core.BasisSet.build(molecule, 'BASIS', basis2, puream=0)
-        bs3 = psi4.core.BasisSet.build(molecule, 'BASIS', basis3, puream=0)
-        bs4 = psi4.core.BasisSet.build(molecule, 'BASIS', basis4, puream=0)
-        nbf1 = bs1.nbf()
-        nbf2 = bs2.nbf()
-        nbf3 = bs3.nbf()
-        nbf4 = bs4.nbf()
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+        nbf3 = basis3.nbf()
+        nbf4 = basis4.nbf()
         # Check if there are `deriv_order` datasets in the eri file
         correct_deriv_order = len(erifile) == deriv_order
         # Check nbf dimension of integral arrays
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 02e4574..3084c8b 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -7,10 +7,10 @@
 from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
 
-def restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
+def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
-    E_scf, C, eps, G = restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    E_scf, C, eps, G = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
 
     nvirt = G.shape[0] - ndocc
     nbf = G.shape[0]

From 0c7b6a5aa4f20804f7fc3035608a11383d05a729 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 12 Oct 2023 14:52:22 -0400
Subject: [PATCH 17/91] HDF5 w/ OMP, Add printing, Change check_disk, MP2-F12
 Dev

---
 quax/core.py                       |  14 +-
 quax/integrals/basis_utils.py      |   3 +-
 quax/integrals/libint_interface.cc | 212 ++++++++++++++++++++++++-----
 quax/integrals/tei.py              |  43 ++----
 quax/methods/ccsd.py               |   5 +-
 quax/methods/ccsd_t.py             |   2 +
 quax/methods/hartree_fock.py       |   2 +
 quax/methods/ints.py               | 122 ++++++++++-------
 quax/methods/mp2.py                |   2 +
 quax/methods/mp2f12.py             |  87 ++++++------
 10 files changed, 316 insertions(+), 176 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 90a1f5b..d121cc0 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -60,6 +60,8 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     # Set keyword options
     if options:
         options = check_options(options)
+        if deriv_order == 0:
+            options['integral_algo'] = 'libint_core'
     else:
         options = check_options({})
     print("Using integral method: {}".format(options['integral_algo']))
@@ -82,7 +84,7 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     natoms = molecule.natom()
     print("Number of basis functions: ", nbf)
 
-    if method == 'mp2-f12': # Ensure use of Dunning basis sets
+    if 'f12' in method: # Ensure use of Dunning basis sets
         try:
             cabs_name = basis_name + "-cabs"
             cabs_space = build_CABS(molecule, basis_name, cabs_name)
@@ -151,27 +153,27 @@ def electronic_energy(*args, deriv_order=deriv_order):
         if method == 'scf' or method == 'hf' or method == 'rhf':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_scf = restricted_hartree_fock(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=False)
+                E_scf = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_mp2f12 = restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
                 return E_mp2f12
         elif method =='mp2-f12':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_mp2 = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_mp2 = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
                 return E_mp2
         elif method =='ccsd':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_ccsd = rccsd(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_ccsd = rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
                 return E_ccsd
         elif method =='ccsd(t)':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_ccsd_t = rccsd_t(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_ccsd_t = rccsd_t(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
                 return E_ccsd_t
         else:
             raise Exception("Error: Method {} not supported.".format(method))
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 40e918c..9801105 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -9,7 +9,7 @@ def build_CABS(molecule, basis_name, cabs_name):
     OBS name, CABS name, and
     MO coefficients from RHF
     """
-    # Libint uses the suffix 'cabs' bu Psi4 uses 'optri'
+    # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
     psi4_name = cabs_name.lower().replace('cabs', 'optri')
 
     keys = ["BASIS","CABS_BASIS"]
@@ -20,6 +20,7 @@ def build_CABS(molecule, basis_name, cabs_name):
     # Creates combined basis set in Python
     obs = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     ao_union = psi4.driver.qcdb.libmintsbasisset.BasisSet.pyconstruct_combined(molecule.save_string_xyz(), keys, targets, roles, others)
+    ao_union['name'] = cabs_name
     ao_union = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
     ri_space = psi4.core.OrbitalSpace.build_ri_space(ao_union, 1.0e-8)
 
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index a53c5ea..022e622 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -18,6 +18,9 @@
 namespace py = pybind11;
 using namespace H5;
 
+/*Global variable, OpenMP lock*/
+omp_lock_t  lock;
+
 std::vector<libint2::Atom> atoms;
 unsigned int natom;
 unsigned int ncart;
@@ -56,6 +59,49 @@ std::vector<libint2::Atom> get_atoms(std::string xyzfilename)
     return atoms;
 }
 
+// Creates a combined basis set
+libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
+    // Create OBS
+    obs_name.erase(obs_name.end() - 5, obs_name.end());
+    auto obs = libint2::BasisSet(obs_name, atoms);
+    obs.set_pure(false); // use cartesian gaussians
+
+    auto obs_idx = obs.atom2shell(atoms);
+    auto cabs_idx = cabs.atom2shell(atoms);
+
+    std::vector<std::vector<libint2::Shell>> el_bases(36); // Only consider atoms up to Kr
+    for (size_t i = 0; i < atoms.size(); i++) {
+        if (el_bases[atoms[i].atomic_number].empty()) {
+            std::vector<libint2::Shell> tmp;
+
+            for(long int& idx : obs_idx[i]) {
+                tmp.push_back(obs[idx]);
+            }
+            for(long int& idx : cabs_idx[i]) {
+                tmp.push_back(cabs[idx]);
+            }
+
+            sort(tmp.begin(), tmp.end(), [i](const auto& a, const auto& b) -> bool
+            {
+                int a_l, b_l;
+                for (auto&& c_a : a.contr)
+                    a_l = c_a.l;
+                for (auto&& c_b : b.contr)
+                    b_l = c_b.l;
+
+                return a_l < b_l;
+            });
+
+            el_bases[atoms[i].atomic_number] = tmp;
+        }
+    }
+
+    // Create CABS, union of orbital and auxiliary basis AOs
+    cabs = libint2::BasisSet(atoms, el_bases);
+    cabs.set_pure(false);
+    return cabs;
+}
+
 // Must call initialize before computing ints 
 void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
                 std::string basis3, std::string basis4) {
@@ -67,12 +113,27 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
     // Move harddrive load of basis and xyz to happen only once
     bs1 = libint2::BasisSet(basis1, atoms);
     bs1.set_pure(false); // use cartesian gaussians
+    if (basis1.find("-cabs", 10) != std::string::npos) {
+        bs1 = make_ao_cabs(basis1, bs1);
+    }
+
     bs2 = libint2::BasisSet(basis2, atoms);
     bs2.set_pure(false); // use cartesian gaussians
+    if (basis2.find("-cabs", 10) != std::string::npos) {
+        bs2 = make_ao_cabs(basis2, bs2);
+    }
+
     bs3 = libint2::BasisSet(basis3, atoms);
     bs3.set_pure(false); // use cartesian gaussians
+    if (basis3.find("-cabs", 10) != std::string::npos) {
+        bs3 = make_ao_cabs(basis3, bs3);
+    }
+
     bs4 = libint2::BasisSet(basis4, atoms);
     bs4.set_pure(false); // use cartesian gaussians
+    if (basis4.find("-cabs", 10) != std::string::npos) {
+        bs4 = make_ao_cabs(basis4, bs4);
+    }
 
     nbf1 = bs1.nbf();
     nbf2 = bs2.nbf();
@@ -1806,6 +1867,9 @@ void oei_deriv_disk(int max_deriv_order) {
         hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[3] = {0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(2) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -1916,6 +1980,9 @@ void oei_deriv_disk(int max_deriv_order) {
                     }
                 } // Unique nuclear cartesian derivative indices loop
 
+                /* Serialize HDF dataset writing using OpenMP lock */
+                omp_set_lock(&lock);
+
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
                 hsize_t count[3] = {n1, n2, nderivs_triu};
@@ -1929,16 +1996,22 @@ void oei_deriv_disk(int max_deriv_order) {
                 overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
                 kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
                 potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                /* Release lock */
+                omp_unset_lock(&lock);
             }
         } // shell duet loops
-    // Delete datasets for this derivative order
-    delete overlap_dataset;
-    delete kinetic_dataset;
-    delete potential_dataset;
+        // Delete datasets for this derivative order
+        delete overlap_dataset;
+        delete kinetic_dataset;
+        delete potential_dataset;
     } // deriv order loop
-// close the file
-delete file;
-std::cout << " done" << std::endl;
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } //oei_deriv_disk 
 
 
@@ -1998,6 +2071,9 @@ void eri_deriv_disk(int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -2087,6 +2163,10 @@ void eri_deriv_disk(int max_deriv_order) {
                                 }
                             }
                         } // For every nuc_idx 0, nderivs_triu
+
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
+
                         // Now write this shell set slab to HDF5 file
                         hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
                         hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
@@ -2097,16 +2177,22 @@ void eri_deriv_disk(int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
                         eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
                     }
                 }
             }
         } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete eri_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
+        // Close the dataset for this derivative order
+        delete eri_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } // eri_deriv_disk function
 
 // Writes all F12 ints up to `max_deriv_order` to disk.
@@ -2168,6 +2254,9 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -2257,6 +2346,10 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
                                 }
                             }
                         } // For every nuc_idx 0, nderivs_triu
+
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
+
                         // Now write this shell set slab to HDF5 file
                         hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
                         hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
@@ -2267,16 +2360,22 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
                         f12_dataset->write(f12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
                     }
                 }
             }
         } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete f12_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
+        // Close the dataset for this derivative order
+        delete f12_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } // f12_deriv_disk function
 
 // Writes all F12 Squared ints up to `max_deriv_order` to disk.
@@ -2340,6 +2439,9 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -2429,6 +2531,10 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
                                 }
                             }
                         } // For every nuc_idx 0, nderivs_triu
+
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
+
                         // Now write this shell set slab to HDF5 file
                         hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
                         hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
@@ -2439,16 +2545,22 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
                         f12_squared_dataset->write(f12_squared_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
                     }
                 }
             }
         } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete f12_squared_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
+        // Close the dataset for this derivative order
+        delete f12_squared_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } // f12_squared_deriv_disk function
 
 // Writes all F12G12 ints up to `max_deriv_order` to disk.
@@ -2510,6 +2622,9 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -2599,6 +2714,10 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
                                 }
                             }
                         } // For every nuc_idx 0, nderivs_triu
+
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
+
                         // Now write this shell set slab to HDF5 file
                         hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
                         hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
@@ -2609,16 +2728,22 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
                         f12g12_dataset->write(f12g12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
                     }
                 }
             }
         } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete f12g12_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
+        // Close the dataset for this derivative order
+        delete f12g12_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } // f12g12_deriv_disk function
 
 // Writes all F12 Double Commutator ints up to `max_deriv_order` to disk.
@@ -2680,6 +2805,9 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
         for(auto s1 = 0; s1 != bs1.size(); ++s1) {
             for(auto s2 = 0; s2 != bs2.size(); ++s2) {
@@ -2769,6 +2897,10 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
                                 }
                             }
                         } // For every nuc_idx 0, nderivs_triu
+
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
+
                         // Now write this shell set slab to HDF5 file
                         hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
                         hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
@@ -2779,16 +2911,22 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
                         f12_double_commutator_dataset->write(f12_double_commutator_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
                     }
                 }
             }
         } // shell quartet loops
-    // Close the dataset for this derivative order
-    delete f12_double_commutator_dataset;
-    } // deriv order loop 
-// Close the file
-delete file;
-std::cout << " done" << std::endl;
+        // Close the dataset for this derivative order
+        delete f12_double_commutator_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
 } // f12_double_commutator_deriv_disk function
 
 // Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 7eed871..bf97660 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -11,7 +11,7 @@
 
 class TEI(object):
 
-    def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mode):
+    def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, options, mode):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
@@ -43,26 +43,6 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, mo
                 eri_deriv = libint_interface.eri_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
                 self.eri_derivatives.append(eri_deriv)
 
-        if 'f12' in mode and max_deriv_order > 0:
-            # A list of ERI derivative tensors, containing only unique elements
-            # corresponding to upper hypertriangle (since derivative tensors are symmetric)
-            # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf,nbf,nbf)
-            # Then when JAX calls JVP, read appropriate slice
-            self.f12_derivatives = []
-            self.f12_squared_derivatives = []
-            self.f12g12_derivatives = []
-            self.f12_double_commutator_derivatives = []
-            for i in range(max_deriv_order):
-                n_unique_derivs = how_many_derivs(natoms, i + 1)
-                f12_deriv = libint_interface.f12_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
-                f12_squared_deriv = libint_interface.f12_squared_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
-                f12g12_deriv = libint_interface.f12g12_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
-                f12_double_commutator_deriv = libint_interface.f12_double_commutator_deriv_core(i + 1).reshape(n_unique_derivs, nbf1, nbf2, nbf3, nbf4)
-                self.f12_derivatives.append(f12_deriv)
-                self.f12_squared_derivatives.append(f12_squared_deriv)
-                self.f12g12_derivatives.append(f12g12_deriv)
-                self.f12_double_commutator_derivatives.append(f12_double_commutator_deriv)
-
         self.mode = mode
         self.nbf1 = nbf1
         self.nbf2 = nbf2
@@ -146,31 +126,26 @@ def f12_double_commutator_deriv(self, geom, beta, deriv_vec):
     # Create primitive evaluation rules
     def eri_impl(self, geom):
         G = libint_interface.eri()
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
         G = G.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(G)
 
     def f12_impl(self, geom, beta):
         F = libint_interface.f12(beta)
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12_squared_impl(self, geom, beta):
         F = libint_interface.f12_squared(beta)
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12g12_impl(self, geom, beta):
         F = libint_interface.f12g12(beta)
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
     
     def f12_double_commutator_impl(self, geom, beta):
         F = libint_interface.f12_double_commutator(beta)
-        #d = int(np.sqrt(np.sqrt(G.shape[0])))
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
@@ -214,8 +189,8 @@ def f12_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 derivatives in memory
         if 'core' in self.mode:
-            F = self.f12_derivatives[deriv_order-1][idx,:,:,:,:]
-            return jnp.asarray(F)
+            F = libint_interface.f12_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif 'disk' in self.mode:
@@ -247,8 +222,8 @@ def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 squared derivatives in memory
         if 'core' in self.mode:
-            F = self.f12_squared_derivatives[deriv_order-1][idx,:,:,:,:]
-            return jnp.asarray(F)
+            F = libint_interface.f12_squared_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif 'disk' in self.mode:
@@ -280,8 +255,8 @@ def f12g12_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12g12 derivatives in memory
         if 'core' in self.mode:
-            F = self.f12g12_derivatives[deriv_order-1][idx,:,:,:,:]
-            return jnp.asarray(F)
+            F = libint_interface.f12g12_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif 'disk' in self.mode:
@@ -313,8 +288,8 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 double commutator derivatives in memory
         if 'core' in self.mode:
-            F = self.f12_double_commutator_derivatives[deriv_order-1][idx,:,:,:,:]
-            return jnp.asarray(F)
+            F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
+            return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif 'disk' in self.mode:
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 421f5f8..a17461b 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -10,6 +10,7 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
     # Do HF
     E_scf, C, eps, V = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
 
+    print("Running CCSD Computation...")
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
     nbf = V.shape[0]
@@ -49,9 +50,9 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
             break
 
     print(iteration, " CCSD iterations performed")
-    #print("CCSD Correlation Energy:   ", E_ccsd)
-    #print("CCSD Total Energy:         ", E_ccsd + E_scf)
     if return_aux_data:
+        #print("CCSD Correlation Energy:   ", E_ccsd)
+        #print("CCSD Total Energy:         ", E_ccsd + E_scf)
         return E_scf + E_ccsd, T1, T2, V, fock_Od, fock_Vd
     else:
         return E_scf + E_ccsd
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 83a0c12..24669d0 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -96,6 +96,8 @@ def loop_k(arr2):
 
 def rccsd_t(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0):
     E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+
+    print("Running (T) Correction...")
     pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
     #print("(T) energy correction:     ", pT)
     #print("CCSD(T) total energy:      ", E_ccsd + pT)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index d69d006..4557384 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -8,6 +8,7 @@
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
 def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
+    print("Running Hartree-Fock Computation...")
     # Load keyword options
     maxit = options['maxit']
     damping = options['damping']
@@ -93,5 +94,6 @@ def rhf_iter(F,D):
     if not return_aux_data:
         return E_scf
     else:
+        #print("RHF Energy:                ", E_scf)
         return E_scf, C, eps, G
 
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 607cf26..1f97502 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -23,10 +23,11 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_disk(geom, basis_set, xyz_path, deriv_order)
+        check = check_oei_disk(geom, basis_set, basis_set, xyz_path, deriv_order)
+        check = check and check_tei_disk(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order)
 
-        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, 'disk')
         oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, options, 'disk')
         # If disk integral derivs are right, nothing to do
         if check:
             S = oei_obj.overlap(geom)
@@ -43,8 +44,8 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, 'core')
         oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'core')
+        tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, options, 'core')
         # Compute integrals
         S = oei_obj.overlap(geom)
         T = oei_obj.kinetic(geom)
@@ -63,7 +64,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_disk(geom, basis1, xyz_path, deriv_order)
+        check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
 
         oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12_disk')
         # If disk integral derivs are right, nothing to do
@@ -97,9 +98,9 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
+        check = check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
 
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'f12_disk')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'f12_disk')
         # If disk integral derivs are right, nothing to do
         if check:
             match int_type:
@@ -112,7 +113,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
                 case "f12_double_commutator":
                     F = tei_obj.f12_double_commutator(geom, beta)
                 case "eri":
-                    F = tei_obj.eri(geom, beta)
+                    F = tei_obj.eri(geom)
         else:
             match int_type:
                 case "f12":
@@ -129,11 +130,11 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
                     F = tei_obj.f12_double_commutator(geom, beta)
                 case "eri":
                     libint_interface.eri_deriv_disk(deriv_order)
-                    F = tei_obj.eri(geom, beta)
+                    F = tei_obj.eri(geom)
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, 'f12_core')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'f12_core')
         # Compute integrals
         match int_type:
             case "f12":
@@ -145,58 +146,63 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
             case "f12_double_commutator":
                 F = tei_obj.f12_double_commutator(geom, beta)
             case "eri":
-                F = tei_obj.eri(geom, beta)
+                F = tei_obj.eri(geom)
 
     libint_interface.finalize()
     return F
 
-def check_disk(geom, basis_set, xyz_path, deriv_order, address=None):
+def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
-    # First check TEI's, then OEI's, return separately, check separately in compute_integrals
+    # Check OEI's in compute_integrals
     correct_int_derivs = False
 
-    if ((os.path.exists("eri_derivs.h5") and os.path.exists("oei_derivs.h5"))):
-        print("Found currently existing integral derivatives in your working directory. Trying to use them.")
+    if ((os.path.exists("oei_derivs.h5"))):
+        print("Found currently existing one-electron integral derivatives in your working directory. Trying to use them.")
         oeifile = h5py.File('oei_derivs.h5', 'r')
-        erifile = h5py.File('eri_derivs.h5', 'r')
         with open(xyz_path, 'r') as f:
             tmp = f.read()
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        nbf = basis_set.nbf()
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+        # Check if there are `deriv_order` datasets in the oei file
-        correct_deriv_order = len(erifile) == deriv_order
+        correct_deriv_order = len(oeifile) == deriv_order
         # Check nbf dimension of integral arrays
         sample_dataset_name = list(oeifile.keys())[0]
-        correct_nbf = oeifile[sample_dataset_name].shape[0] == nbf
+        correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
+        correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
         oeifile.close()
-        erifile.close()
-        correct_int_derivs = correct_deriv_order and correct_nbf
-        if correct_int_derivs:
-            print("Integral derivatives appear to be correct. Avoiding recomputation.")
+        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
 
-#    # TODO flesh out this logic for determining if partials file contains all integrals needed
-#    # for particular address
-#    elif ((os.path.exists("eri_partials.h5") and os.path.exists("oei_partials.h5"))):
-#        print("Found currently existing partial derivatives in working directory. Assuming they are correct.") 
-#        oeifile = h5py.File('oei_partials.h5', 'r')
-#        erifile = h5py.File('eri_partials.h5', 'r')
-#        with open(xyz_path, 'r') as f:
-#            tmp = f.read()
-#        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-#        basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
-#        nbf = basis_set.nbf()
-#        sample_dataset_name = list(oeifile.keys())[0]
-#        correct_nbf = oeifile[sample_dataset_name].shape[0] == nbf
-#        correct_int_derivs = correct_nbf
-#    return correct_int_derivs
-
-def check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, address=None):
+    # TODO flesh out this logic for determining if partials file contains all integrals needed
+    # for particular address
+    elif (os.path.exists("oei_partials.h5")):
+        print("Found currently existing partial oei derivatives in working directory. Assuming they are correct.")
+        oeifile = h5py.File('oei_partials.h5', 'r')
+        with open(xyz_path, 'r') as f:
+            tmp = f.read()
+        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+        # Check if there are `deriv_order` datasets in the oei partials file
+        correct_deriv_order = len(oeifile) == deriv_order
+        # Check nbf dimension of integral arrays
+        sample_dataset_name = list(oeifile.keys())[0]
+        correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
+        correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
+        oeifile.close()
+        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
+
+    if correct_int_derivs:
+        print("Integral derivatives appear to be correct. Avoiding recomputation.")
+    return correct_int_derivs
+
+def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
-    # First check TEI's, then OEI's, return separately, check separately in compute_integrals
+    # Check TEI's in compute_integrals
     correct_int_derivs = False
 
     if ((os.path.exists(int_type + "_derivs.h5"))):
-        print("Found currently existing integral derivatives in your working directory. Trying to use them.")
+        print("Found currently existing " + int_type + " integral derivatives in your working directory. Trying to use them.")
         erifile = h5py.File(int_type + '_derivs.h5', 'r')
         with open(xyz_path, 'r') as f:
             tmp = f.read()
@@ -208,14 +214,38 @@ def check_disk_f12(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
         # Check if there are `deriv_order` datasets in the eri file
         correct_deriv_order = len(erifile) == deriv_order
         # Check nbf dimension of integral arrays
-        sample_dataset_name = list(oeifile.keys())[0]
-        correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
-        correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
-        correct_nbf3 = oeifile[sample_dataset_name].shape[2] == nbf3
-        correct_nbf4 = oeifile[sample_dataset_name].shape[3] == nbf4
+        sample_dataset_name = list(erifile.keys())[0]
+        correct_nbf1 = erifile[sample_dataset_name].shape[0] == nbf1
+        correct_nbf2 = erifile[sample_dataset_name].shape[1] == nbf2
+        correct_nbf3 = erifile[sample_dataset_name].shape[2] == nbf3
+        correct_nbf4 = erifile[sample_dataset_name].shape[3] == nbf4
         erifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 and correct_nbf3 and correct_nbf4
         if correct_int_derivs:
             print("Integral derivatives appear to be correct. Avoiding recomputation.")
 
-    return correct_int_derivs
+    # TODO flesh out this logic for determining if partials file contains all integrals needed
+    # for particular address
+    elif ((os.path.exists("eri_partials.h5"))):
+        print("Found currently existing partial tei derivatives in working directory. Assuming they are correct.")
+        erifile = h5py.File('eri_partials.h5', 'r')
+        with open(xyz_path, 'r') as f:
+            tmp = f.read()
+        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+        nbf3 = basis3.nbf()
+        nbf4 = basis4.nbf()
+        sample_dataset_name = list(erifile.keys())[0]
+        correct_nbf1 = erifile[sample_dataset_name].shape[0] == nbf1
+        correct_nbf2 = erifile[sample_dataset_name].shape[1] == nbf2
+        correct_nbf3 = erifile[sample_dataset_name].shape[2] == nbf3
+        correct_nbf4 = erifile[sample_dataset_name].shape[3] == nbf4
+        erifile.close()
+        # Only the nbf dimensions are compared; no dataset count is read in this branch
+        correct_int_derivs = correct_nbf1 and correct_nbf2 and correct_nbf3 and correct_nbf4
+        if correct_int_derivs:
+            print("Integral derivatives appear to be correct. Avoiding recomputation.")
+
+    # Single exit point: also reached when no derivative file exists, returning False
+    return correct_int_derivs
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 3084c8b..54aada6 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -12,6 +12,7 @@ def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options,
     ndocc = nelectrons // 2
     E_scf, C, eps, G = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
 
+    print("Running MP2 Computation...")
     nvirt = G.shape[0] - ndocc
     nbf = G.shape[0]
 
@@ -39,6 +40,7 @@ def loop_mp2(idx, mp2_corr):
     dE_mp2 = fori_loop(0, indices.shape[0], loop_mp2, 0.0) # MP2 correlation
 
     if return_aux_data:
+        #print("MP2 Energy:                ", E_scf + dE_mp2)
         return E_scf + dE_mp2, C, eps
     else:
         return E_scf + dE_mp2
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 82a2833..f61dc16 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -8,83 +8,72 @@
 from .energy_utils import nuclear_repulsion, tei_transformation
 from .mp2 import restricted_mp2
 
-def restricted_mp2_f12(geom, basis_name, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
+def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
-    E_mp2, C_obs, eps = restricted_mp2(geom, basis_name, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
-    cabs_name = cabs_space.name()
+    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+
+    print("Running MP2-F12 Computation...")
+    cabs_set = cabs_space.basisset()
     C_cabs = jnp.array(cabs_space.C().to_array())
+    nobs = C_obs.shape[0]
+    nri = C_cabs.shape[0]
 
-    f, fk = form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, ndocc, xyz_path, deriv_order, options)
+    f, fk = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     return f
 
-def form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options):
-    nobs = C_obs.shape[0]
-    nri = C_cabs.shape[0]
-
+def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
     h = jnp.empty((nri, nri))
 
-    # <O|O>
-    h_tmp = compute_f12_oeints(geom, basis_name, basis_name, xyz_path, deriv_order, options)
+    h_tmp = compute_f12_oeints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
-    h = h.at[:nobs, :nobs].set(h_tmp)
+    h = h.at[:nobs, :nobs].set(h_tmp) # <O|O>
 
-    # <O|C> and <C|O>
-    h_tmp = compute_f12_oeints(geom, basis_name, cabs_name, xyz_path, deriv_order, options)
+    h_tmp = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
-    h = h.at[:nobs, nobs:nri].set(h_tmp)
-    h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp))
+    h = h.at[:nobs, nobs:nri].set(h_tmp) # <O|C>
+    h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp)) # <C|O>
 
-    # <C|C>
-    h_tmp = compute_f12_oeints(geom, cabs_name, cabs_name, xyz_path, deriv_order, options)
+    h_tmp = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_cabs, C_cabs, h_tmp, optimize='optimal')
-    h = h.at[nobs:nri, nobs:nri].set(h_tmp)
+    h = h.at[nobs:nri, nobs:nri].set(h_tmp) # <C|C>
+    del h_tmp
 
     return h
 
-def form_Fock(geom, basis_name, cabs_name, C_obs, C_cabs, nocc, xyz_path, deriv_order, options):
-    nobs = C_obs.shape[0]
-    nri = C_cabs.shape[0]
-
-    f = jnp.empty((nri, nri))
-    fk = jnp.empty((nri, nri))
-
+def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
     # OEINTS
-    h = form_h(geom, basis_name, cabs_name, C_obs, C_cabs, xyz_path, deriv_order, options)
-    f.at[:, :].set(h)
+    f = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
 
     # TEINTS
     G = jnp.empty((nri, nobs, nri, nri))
 
-    G_tmp = compute_f12_teints(geom, basis_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_obs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
-    G = G.at[:nobs, :nocc, :nobs, :nobs].set(G_tmp) # <OO|OO>
+    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_obs, C_obs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
+    G = G.at[:nobs, :ndocc, :nobs, :nobs].set(G_tmp) # <Oo|OO>
 
-    G_tmp = compute_f12_teints(geom, cabs_name, basis_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
-    G = G.at[nobs:nri, :nocc, :nobs, :nobs].set(G_tmp) # <CO|OO>
-    G = G.at[:nocc, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,3,1,0))) # <OO|CO>
-    G = G.at[:nocc, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
+    G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :nobs, :nobs, :nobs].set(G_tmp) # <CO|OO>
+    G = G.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,1,0,3))) # <OO|CO>
+    G = G.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
 
-    G_tmp = compute_f12_teints(geom, cabs_name, basis_name, basis_name, cabs_name, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_obs, C_obs, C_cabs, G_tmp, optimize='optimal')
-    G = G.at[nobs:nri, :nocc, :nobs, nobs:nri].set(G_tmp) # <CO|OC>
+    G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs[:, :ndocc], C_cabs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :ndocc, :nobs, nobs:nri].set(G_tmp) # <Co|OC>
 
-    G_tmp = compute_f12_teints(geom, cabs_name, cabs_name, basis_name, basis_name, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,qQ,rR,sS,pqrs->PRQS', C_cabs, C_cabs, C_obs, C_obs, G_tmp, optimize='optimal')
-    G = G.at[nobs:nri, :nocc, nobs:nri, :nobs].set(G_tmp) # <CO|CO>
+    G_tmp = compute_f12_teints(geom, cabs_set, cabs_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_cabs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
+    G = G.at[nobs:nri, :ndocc, nobs:nri, :nobs].set(G_tmp) # <Co|CO>
+    del G_tmp
 
     # Fill Fock Matrix
-    f.at[:, :].set(2.0 * jnp.einsum('PIQI->PQ', G[:, :nocc, :, nocc], optimize='optimal'))
-    fk.at[:, :].set(f)      
-    f.at[:, :].add(-1.0 * jnp.einsum('PIIQ->PQ', G[:, :nocc, :nocc, :], optimize='optimal'))
+    f = f.at[:, :].add(2.0 * jnp.einsum('PIQI->PQ', G[:, :ndocc, :, :ndocc], optimize='optimal'))
+    fk = f # Fock Matrix without Exchange
+    f = f.at[:, :].add(-1.0 * jnp.einsum('PIIQ->PQ', G[:, :ndocc, :ndocc, :], optimize='optimal'))
 
     return f, fk
 
-    
-
-
-
-
+#def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
     

From eabf2cc36803ac54537eaac41f32deaa6cf7216e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 12 Oct 2023 15:17:16 -0400
Subject: [PATCH 18/91] Clarify makefile

---
 quax/integrals/makefile | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index eb7acde..d2ef2a9 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -1,25 +1,27 @@
 # NOTE: These paths below need to be edited such that they point to a set of 
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
-# Options passed to compiler
+# Options passed to compiler; drop "-fopenmp" if not intending to use OpenMP
 CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
 LIBINT_PREFIX := /home/ecm23353/psi_env
+# Conda prefix location, it is suggested to use conda to install nearly all dependencies
+CONDA_PREFIX := /home/ecm23353/psi_env
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := /home/ecm23353/psi_env/include/eigen3
+I3 := $(CONDA_PREFIX)/include/eigen3
 # Python headers location 
-I4 := /home/ecm23353/psi_env/include/python3.10
+I4 := $(CONDA_PREFIX)/include/python3.10
 # Pybind11 headers location 
-I5 := /home/ecm23353/psi_env/lib/python3.10/site-packages/pybind11/include
+I5 := $(CONDA_PREFIX)/lib/python3.10/site-packages/pybind11/include
 # HDF5 headers, static and shared libraries 
-I6 := /home/ecm23353/psi_env/include
-L2 := /home/ecm23353/psi_env/lib
+I6 := $(CONDA_PREFIX)/include
+L2 := $(CONDA_PREFIX)/lib
 # Edit path in quotes to be same location as L2 definition above
-RPATH := -Wl,-rpath,"/home/ecm23353/psi_env/lib"
+RPATH := -Wl,-rpath,"$(CONDA_PREFIX)/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)

From 2dc0cc0bd5ba73e21757a4fbddfe10edadcdb75e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 17 Oct 2023 15:18:30 -0400
Subject: [PATCH 19/91] Stable_sort for CABS, VXC intermediates

---
 quax/integrals/libint_interface.cc |  2 +-
 quax/methods/hartree_fock.py       |  8 +--
 quax/methods/mp2f12.py             | 78 ++++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 022e622..62a6300 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -81,7 +81,7 @@ libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
                 tmp.push_back(cabs[idx]);
             }
 
-            sort(tmp.begin(), tmp.end(), [i](const auto& a, const auto& b) -> bool
+            stable_sort(tmp.begin(), tmp.end(), [](const auto& a, const auto& b) -> bool
             {
                 int a_l, b_l;
                 for (auto&& c_a : a.contr)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 4557384..623d071 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -21,9 +21,9 @@ def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge,
 
     # If we are doing MP2 or CCSD after, might as well use jit-compiled JK-build, since HF will not be memory bottleneck
     if return_aux_data:
-        jk_build = jax.jit(jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None)))
+        jk_build = jax.jit(jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0,1), (0,1)]), in_axes=(0, None)), in_axes=(0, None)))
     else: 
-        jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0, 1), (0, 1)]), in_axes=(0, None)), in_axes=(0, None))
+        jk_build = jax.vmap(jax.vmap(lambda x,y: jnp.tensordot(x, y, axes=[(0,1), (0,1)]), in_axes=(0, None)), in_axes=(0, None))
 
     S, T, V, G = compute_integrals(geom, basis_set, xyz_path, deriv_order, options)
     # Canonical orthogonalization via cholesky decomposition
@@ -68,10 +68,10 @@ def rhf_iter(F,D):
         if damping:
             if iteration < 10:
                 D = Dold * damp_factor + D * damp_factor
-                Dold = D * 1
+                Dold = D * 1.0
         # Build JK matrix: 2 * J - K
         JK = 2 * jk_build(G, D)
-        JK -= jk_build(G.transpose((0, 2, 1, 3)), D)
+        JK -= jk_build(G.transpose((0,2,1,3)), D)
         # Build Fock
         F = H + JK
         # Update convergence error
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index f61dc16..82aa413 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -3,9 +3,10 @@
 import jax.numpy as jnp
 from jax.lax import fori_loop
 import psi4
+import sys
+jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
 
 from .ints import compute_f12_oeints, compute_f12_teints
-from .energy_utils import nuclear_repulsion, tei_transformation
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
@@ -21,7 +22,14 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
 
     f, fk = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    return f
+    V = form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+
+    X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+
+    C = form_C(geom, basis_set, cabs_set, C_obs, C_cabs, f, ndocc, nobs, xyz_path, deriv_order, options)
+    jax.debug.breakpoint()
+
+    return 0
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
     h = jnp.empty((nri, nri))
@@ -29,11 +37,13 @@ def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_
     h_tmp = compute_f12_oeints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
     h = h.at[:nobs, :nobs].set(h_tmp) # <O|O>
+    del h_tmp
 
     h_tmp = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
     h = h.at[:nobs, nobs:nri].set(h_tmp) # <O|C>
     h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp)) # <C|O>
+    del h_tmp
 
     h_tmp = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_cabs, C_cabs, h_tmp, optimize='optimal')
@@ -52,16 +62,19 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_obs, C_obs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
     G = G.at[:nobs, :ndocc, :nobs, :nobs].set(G_tmp) # <Oo|OO>
+    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
     G = G.at[nobs:nri, :nobs, :nobs, :nobs].set(G_tmp) # <CO|OO>
     G = G.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,1,0,3))) # <OO|CO>
     G = G.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
+    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
     G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs[:, :ndocc], C_cabs, G_tmp, optimize='optimal')
     G = G.at[nobs:nri, :ndocc, :nobs, nobs:nri].set(G_tmp) # <Co|OC>
+    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, cabs_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_cabs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
@@ -75,5 +88,62 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
 
     return f, fk
 
-#def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    
+def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+
+    V = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12g12", xyz_path, deriv_order, options)
+    V = jnp.einsum('iI,kK,jJ,lL,ikjl->IJKL', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], V, optimize='optimal')
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = jnp.einsum('iI,mM,jJ,yY,imjy->IJMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
+    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('kK,mM,lL,yY,kmly->KLMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, G_tmp, optimize='optimal')
+    V_tmp = -1.0 * jnp.einsum('IJMY,KLMY->IJKL', F_tmp, G_tmp, optimize='optimal')
+    V = V.at[:, :, :, :].add(V_tmp)
+    V = V.at[:, :, :, :].add(jnp.transpose(V_tmp, (1,0,3,2)))
+    del V_tmp
+    del F_tmp
+    del G_tmp
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = jnp.einsum('iI,rR,jJ,sS,irjs->IJRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], F_tmp, optimize='optimal')
+    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
+    G_tmp = jnp.einsum('kK,rR,lL,sS,krls->KLRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], G_tmp, optimize='optimal')
+    V = V.at[:, :, :, :].add(-1.0 * jnp.einsum('IJRS,KLRS->IJKL', F_tmp, G_tmp, optimize='optimal'))
+    del F_tmp
+    del G_tmp
+
+    return V
+
+def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+
+    X = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
+    X = jnp.einsum('iI,kK,jJ,lL,ikjl->IJKL', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], X, optimize='optimal')
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = jnp.einsum('iI,mM,jJ,yY,imjy->IJMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
+    X_tmp = -1.0 * jnp.einsum('IJMY,KLMY->IJKL', F_tmp, F_tmp, optimize='optimal')
+    X = X.at[:, :, :, :].add(X_tmp)
+    X = X.at[:, :, :, :].add(jnp.transpose(X_tmp, (1,0,3,2)))
+    del X_tmp
+    del F_tmp
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = jnp.einsum('iI,rR,jJ,sS,irjs->IJRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], F_tmp, optimize='optimal')
+    X = X.at[:, :, :, :].add(-1.0 * jnp.einsum('IJRS,KLRS->IJKL', F_tmp, F_tmp, optimize='optimal'))
+    del F_tmp
+
+    return X
+
+def form_C(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, ndocc, nobs, xyz_path, deriv_order, options):
+
+    C = jnp.empty((ndocc, ndocc, nobs - ndocc, nobs - ndocc))
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = jnp.einsum('kK,aA,lL,yY,kaly->KLAY', C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
+    C_tmp = jnp.einsum('KLAY,BY->KLAB', F_tmp, Fock[ndocc:nobs, nobs:])
+    del F_tmp
+
+    C = C.at[:, :, :, :].set(C_tmp)
+    C = C.at[:, :, :, :].add(jnp.transpose(C_tmp, (1,0,3,2)))
+
+    return C

From 9709da2e280764fd863037f029dbe7285e5c5eed Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 19 Oct 2023 16:12:16 -0400
Subject: [PATCH 20/91] B Intermediate Correct, New Partial TEI Algo

---
 quax/methods/ccsd.py         |   4 +-
 quax/methods/ccsd_t.py       |   1 -
 quax/methods/energy_utils.py |  20 +++++-
 quax/methods/ints.py         |   4 +-
 quax/methods/mp2.py          |   2 +-
 quax/methods/mp2f12.py       | 130 +++++++++++++++++++++++++++--------
 6 files changed, 127 insertions(+), 34 deletions(-)

diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index a17461b..85b57e7 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -3,7 +3,7 @@
 import jax.numpy as jnp
 import psi4
 
-from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
+from .energy_utils import tei_transformation
 from .hartree_fock import restricted_hartree_fock
 
 def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
@@ -20,7 +20,7 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
     v = slice(ndocc, nbf)
 
     # Save slices of two-electron repulsion integrals in MO basis
-    V = tei_transformation(V,C)
+    V = tei_transformation(V, C)
     V = jnp.swapaxes(V,1,2)
     V = (V[o,o,o,o], V[o,o,o,v], V[o,o,v,v], V[o,v,o,v], V[o,v,v,v], V[v,v,v,v])
 
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 24669d0..9f45f0f 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -3,7 +3,6 @@
 import jax.numpy as jnp
 from jax.lax import while_loop
 
-from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation
 from .ccsd import rccsd
 
 def perturbative_triples(T1, T2, V, fock_Od, fock_Vd):
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 0d0e9dc..61df17c 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -60,9 +60,27 @@ def tei_transformation(G, C):
     G = transform(C, G)
     return G
 
-def partial_tei_transformation(G, Ci, Cj, Ck, Cl):
+def old_partial_tei_transformation(G, Ci, Cj, Ck, Cl):
     G = jnp.einsum('pqrs, pP, qQ, rR, sS -> PQRS', G, Ci, Cj, Ck, Cl, optimize='optimal')
     return G
+
+def partial_tei_transformation(G, C1, C2, C3, C4):
+    """
+    New algo for Partial TEI transform
+    """
+    G = transform(C4, G)
+    G = transform(C3, G)
+    G = transform(C2, G)
+    G = transform(C1, G)
+    return G
+
+@jax.jit
+def chem2phys(G):
+    return jnp.transpose(G, (0,2,1,3))
+
+@jax.jit
+def f12_transpose(G):
+    return jnp.transpose(G, (1,0,3,2))
     
 def cartesian_product(*arrays):
     '''
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 1f97502..7456fcb 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -7,7 +7,7 @@
 import psi4
 import os
 
-from ..utils import get_deriv_vec_idx, get_required_deriv_vecs
+from .energy_utils import chem2phys
 
 # Check for Libint interface
 from ..integrals import TEI
@@ -149,7 +149,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
                 F = tei_obj.eri(geom)
 
     libint_interface.finalize()
-    return F
+    return chem2phys(F)
 
 def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 54aada6..e24a2e2 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -4,7 +4,7 @@
 from jax.lax import fori_loop
 import psi4
 
-from .energy_utils import nuclear_repulsion, partial_tei_transformation, tei_transformation, cartesian_product
+from .energy_utils import partial_tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
 
 def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 82aa413..ed230a3 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -6,7 +6,8 @@
 import sys
 jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
 
-from .ints import compute_f12_oeints, compute_f12_teints
+from .ints import compute_f12_oeints, compute_f12_teints # F12 TEINTS are entered in Chem and returned in Phys
+from .energy_utils import partial_tei_transformation, f12_transpose
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
@@ -20,16 +21,19 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     nobs = C_obs.shape[0]
     nri = C_cabs.shape[0]
 
-    f, fk = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     V = form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
 
     X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
 
     C = form_C(geom, basis_set, cabs_set, C_obs, C_cabs, f, ndocc, nobs, xyz_path, deriv_order, options)
+
+    B = form_B(geom, basis_set, cabs_set, C_obs, C_cabs, f, fk, k, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    del fk
     jax.debug.breakpoint()
 
-    return 0
+    return jnp.array([0])
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
     h = jnp.empty((nri, nri))
@@ -60,55 +64,57 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
     G = jnp.empty((nri, nobs, nri, nri))
 
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_obs, C_obs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
+    G_tmp = partial_tei_transformation(G_tmp, C_obs, C_obs[:, :ndocc], C_obs, C_obs)
     G = G.at[:nobs, :ndocc, :nobs, :nobs].set(G_tmp) # <Oo|OO>
     del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs, C_obs, G_tmp, optimize='optimal')
+    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs, C_obs, C_obs)
     G = G.at[nobs:nri, :nobs, :nobs, :nobs].set(G_tmp) # <CO|OO>
     G = G.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,1,0,3))) # <OO|CO>
     G = G.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
     del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_obs, C_obs[:, :ndocc], C_cabs, G_tmp, optimize='optimal')
+    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_obs, C_cabs)
     G = G.at[nobs:nri, :ndocc, :nobs, nobs:nri].set(G_tmp) # <Co|OC>
     del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, cabs_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('pP,rR,qQ,sS,prqs->PQRS', C_cabs, C_cabs, C_obs[:, :ndocc], C_obs, G_tmp, optimize='optimal')
+    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_cabs, C_obs)
     G = G.at[nobs:nri, :ndocc, nobs:nri, :nobs].set(G_tmp) # <Co|CO>
     del G_tmp
 
     # Fill Fock Matrix
-    f = f.at[:, :].add(2.0 * jnp.einsum('PIQI->PQ', G[:, :ndocc, :, :ndocc], optimize='optimal'))
+    f = f.at[:, :].add(2.0 * jnp.einsum('piqi->pq', G[:, :ndocc, :, :ndocc], optimize='optimal'))
     fk = f # Fock Matrix without Exchange
-    f = f.at[:, :].add(-1.0 * jnp.einsum('PIIQ->PQ', G[:, :ndocc, :ndocc, :], optimize='optimal'))
+    k =  jnp.einsum('piiq->pq', G[:, :ndocc, :ndocc, :], optimize='optimal')
+    f = f.at[:, :].add(-1.0 * k)
+    del G
 
-    return f, fk
+    return f, fk, k
 
 def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
 
     V = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12g12", xyz_path, deriv_order, options)
-    V = jnp.einsum('iI,kK,jJ,lL,ikjl->IJKL', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], V, optimize='optimal')
+    V = partial_tei_transformation(V, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = jnp.einsum('iI,mM,jJ,yY,imjy->IJMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('kK,mM,lL,yY,kmly->KLMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, G_tmp, optimize='optimal')
-    V_tmp = -1.0 * jnp.einsum('IJMY,KLMY->IJKL', F_tmp, G_tmp, optimize='optimal')
+    G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
+    V_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', G_tmp, F_tmp, optimize='optimal')
     V = V.at[:, :, :, :].add(V_tmp)
-    V = V.at[:, :, :, :].add(jnp.transpose(V_tmp, (1,0,3,2)))
+    V = V.at[:, :, :, :].add(f12_transpose(V_tmp))
     del V_tmp
     del F_tmp
     del G_tmp
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = jnp.einsum('iI,rR,jJ,sS,irjs->IJRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], F_tmp, optimize='optimal')
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = jnp.einsum('kK,rR,lL,sS,krls->KLRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], G_tmp, optimize='optimal')
-    V = V.at[:, :, :, :].add(-1.0 * jnp.einsum('IJRS,KLRS->IJKL', F_tmp, G_tmp, optimize='optimal'))
+    G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
+    V = V.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', G_tmp, F_tmp, optimize='optimal'))
     del F_tmp
     del G_tmp
 
@@ -117,19 +123,19 @@ def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deri
 def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
 
     X = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
-    X = jnp.einsum('iI,kK,jJ,lL,ikjl->IJKL', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], X, optimize='optimal')
+    X = partial_tei_transformation(X, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = jnp.einsum('iI,mM,jJ,yY,imjy->IJMY', C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
-    X_tmp = -1.0 * jnp.einsum('IJMY,KLMY->IJKL', F_tmp, F_tmp, optimize='optimal')
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
+    X_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', F_tmp, F_tmp, optimize='optimal')
     X = X.at[:, :, :, :].add(X_tmp)
-    X = X.at[:, :, :, :].add(jnp.transpose(X_tmp, (1,0,3,2)))
+    X = X.at[:, :, :, :].add(f12_transpose(X_tmp))
     del X_tmp
     del F_tmp
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = jnp.einsum('iI,rR,jJ,sS,irjs->IJRS', C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :ndocc], C_obs[:, :nobs], F_tmp, optimize='optimal')
-    X = X.at[:, :, :, :].add(-1.0 * jnp.einsum('IJRS,KLRS->IJKL', F_tmp, F_tmp, optimize='optimal'))
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
+    X = X.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', F_tmp, F_tmp, optimize='optimal'))
     del F_tmp
 
     return X
@@ -139,11 +145,81 @@ def form_C(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, ndocc, nobs, xyz_path
     C = jnp.empty((ndocc, ndocc, nobs - ndocc, nobs - ndocc))
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = jnp.einsum('kK,aA,lL,yY,kaly->KLAY', C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_obs[:, :ndocc], C_cabs, F_tmp, optimize='optimal')
-    C_tmp = jnp.einsum('KLAY,BY->KLAB', F_tmp, Fock[ndocc:nobs, nobs:])
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_cabs)
+    C_tmp = jnp.einsum('klay,by->klab', F_tmp, Fock[ndocc:nobs, nobs:], optimize='optimal')
     del F_tmp
 
     C = C.at[:, :, :, :].set(C_tmp)
-    C = C.at[:, :, :, :].add(jnp.transpose(C_tmp, (1,0,3,2)))
+    C = C.at[:, :, :, :].add(f12_transpose(C_tmp))
+    del C_tmp
 
     return C
+
+def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    # Term 1
+    B = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_double_commutator", xyz_path, deriv_order, options)
+    B = partial_tei_transformation(B, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
+
+    # Term 2
+    F2 = jnp.empty((ndocc, ndocc, ndocc, nri))
+
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
+    del F_tmp
+
+    tmp = jnp.einsum('lknI,mI->lknm', F2, noK[:ndocc, :])
+    del F2
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    del tmp
+
+    # F12 Integral
+    F_oo11 = jnp.empty((ndocc, ndocc, nri, nri))
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
+    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(F_tmp) # <oo|OC>
+    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(F_tmp)) # <oo|CO>
+    F_tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
+    del F_tmp
+
+    # Term 3
+    tmp = -1.0 * jnp.einsum('lkPC,CA,nmPA->lknm', F_oo11, K, F_oo11, optimize='optimal')
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    # Term 4
+    tmp = -1.0 * jnp.einsum('lkjC,CA,nmjA->lknm', F_oo11[:, :, :ndocc, :], Fock, F_oo11[:, :, :ndocc, :], optimize='optimal')
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    # Term 5
+    tmp = jnp.einsum('lkxj,ji,nmxi->lknm', F_oo11[:, :, nobs:, :ndocc], Fock[:ndocc, :ndocc], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    # Term 6
+    tmp = -1.0 * jnp.einsum('lkbp,pq,nmbq->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, :nobs], F_oo11[:, :, ndocc:nobs, :nobs], optimize='optimal')
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    # Term 7
+    tmp = -2.0 * jnp.einsum('lkxI,jI,nmxj->lknm', F_oo11[:, :, nobs:, :], Fock[:ndocc, :], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    # Term 8
+    tmp = -2.0 * jnp.einsum('lkbq,qy,nmby->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, nobs:], F_oo11[:, :, ndocc:nobs, nobs:], optimize='optimal')
+    del F_oo11
+    B = B.at[:, :, :, :].add(tmp)
+    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+
+    tmp = jnp.transpose(B, (2,3,0,1))
+    B = B.at[:, :, :, :].add(tmp)
+    del tmp
+
+    return 0.5 * B

From 99e02854e6abd13d003c31714669a34f8f029c86 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 19 Oct 2023 17:36:33 -0400
Subject: [PATCH 21/91] Working MP2-F12/3C(FIX)

---
 quax/methods/mp2f12.py | 92 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 89 insertions(+), 3 deletions(-)

diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index ed230a3..0a0bbe1 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -14,6 +14,8 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
     E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    e_ij = eps[:ndocc]
+    e_ab = eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
     cabs_set = cabs_space.basisset()
@@ -30,10 +32,15 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     C = form_C(geom, basis_set, cabs_set, C_obs, C_cabs, f, ndocc, nobs, xyz_path, deriv_order, options)
 
     B = form_B(geom, basis_set, cabs_set, C_obs, C_cabs, f, fk, k, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    del fk
-    jax.debug.breakpoint()
 
-    return jnp.array([0])
+    D = -1.0 / (e_ij.reshape(-1, 1, 1, 1) + e_ij.reshape(-1, 1, 1) - e_ab.reshape(-1, 1) - e_ab)
+
+    G = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
+    G = partial_tei_transformation(G, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_obs[:, ndocc:nobs])
+    
+    E_f12 = form_energy(V, X, C, B, D, f, G, ndocc, nobs)
+
+    return E_f12
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
     h = jnp.empty((nri, nri))
@@ -94,6 +101,85 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
 
     return f, fk, k
 
+# F12 Energy and Energy (Tilde) Intermediates
+def kron_delta(i, j):
+    if i == j:
+        return 1.0
+    else:
+        return 2.0
+
+def form_energy(V, X, C, B, D, Fock, G, ndocc, nobs):
+    # Singlet and Triplet Pair Energies
+    E_f12_s = 0.0
+    E_f12_t = 0.0
+
+    for i in range(ndocc):
+        for j in range(i, ndocc):
+            B_ij = B - (X * (Fock[i, i] + Fock[j, j]))
+            V_s, V_t = form_V_Tilde(V[i, j, :, :], C, G[i, j, :, :], D[i, j, :, :], i, j)
+            B_s, B_t = form_B_Tilde(B_ij, C, D[i, j, :, :], i, j)
+
+            kd = kron_delta(i, j)
+
+            E_s = kd * (V_s + B_s)
+            E_f12_s += E_s
+
+            E_t = 0.0
+            if i != j:
+                E_t = 3.0 * kd * (V_t + B_t)
+                E_f12_t += E_t
+
+    return E_f12_s + E_f12_t
+
+def t_(p, q, r, s):
+    # Fixed Amplitude Ansatz
+    if p == r and q == s and p != q:
+        return 3.0 / 8.0
+    elif q == r and p == s and p != q:
+        return 1.0 / 8.0
+    elif p == q and p == r and p == s:
+        return 0.5
+    else:
+        return 0.0
+
+def form_V_Tilde(V_ij, C, G_ij, D_ij, i, j):
+    # Singlet and Triplet Pair Energies
+    V_s = 0.0
+    V_t = 0.0
+
+    V_ij = V_ij.at[:, :].add(-1.0 *jnp.einsum('klab,ab,ab->kl', C, G_ij, D_ij, optimize='optimal'))
+
+    kd = kron_delta(i, j)
+
+    V_s += 0.5 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
+
+    if i != j:
+        V_t += 0.5 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd * (V_ij[i, j] - V_ij[j, i])
+
+    return V_s, V_t
+
+def form_B_Tilde(B_ij, C, D_ij, i, j):
+    # Singlet and Triplet Pair Energies
+    B_s = 0.0
+    B_t = 0.0
+
+    B_ij = B_ij.at[:, :, :, :].add(-1.0 * jnp.einsum('klab,ab,mnab', C, D_ij, C, optimize='optimal'))
+
+    kd = kron_delta(i, j)
+
+    B_s += 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
+                 * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
+                 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
+    
+    if i != j:
+        B_t += 0.125 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd \
+                     * (B_ij[i, j, i, j] - B_ij[j, i, i, j]) \
+                     * (t_(i, j, i, j) - t_(i, j, j, i)) * kd
+        
+    return B_s, B_t
+
+# F12 Intermediates
+
 def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
 
     V = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12g12", xyz_path, deriv_order, options)

From 8c9c79283c8f494a8bef1b6e2fe34ddd5bba697b Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 27 Oct 2023 13:34:30 -0400
Subject: [PATCH 22/91] Fix MP2 and MP2F12 mixup, libint2 calls, and disk
 check. MP2F12 dev

---
 quax/core.py           |  11 +--
 quax/methods/ccsd.py   |   4 +-
 quax/methods/ccsd_t.py |  36 ++++----
 quax/methods/ints.py   |  25 +++---
 quax/methods/mp2.py    |   1 -
 quax/methods/mp2f12.py | 188 ++++++++++++++---------------------------
 6 files changed, 104 insertions(+), 161 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index d121cc0..d0b13bd 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -102,8 +102,9 @@ def electronic_energy(*args, deriv_order=deriv_order):
             def electronic_energy(*args, deriv_order=deriv_order):
                 return restricted_mp2(*args, deriv_order=deriv_order)
         elif method =='mp2-f12':
+            args = args + (cabs_space,)
             def electronic_energy(*args, deriv_order=deriv_order):
-                return restricted_mp2_f12(*args, cabs_space, deriv_order=deriv_order)
+                return restricted_mp2_f12(*args, deriv_order=deriv_order)
         elif method =='ccsd':
             def electronic_energy(*args, deriv_order=deriv_order):
                 return rccsd(*args, deriv_order=deriv_order)
@@ -158,13 +159,13 @@ def partial_wrapper(*args):
         elif method =='mp2':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
-                return E_mp2f12
+                E_mp2 = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                return E_mp2
         elif method =='mp2-f12':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_mp2 = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
-                return E_mp2
+                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=deriv_order)
+                return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 85b57e7..df578d7 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -32,8 +32,8 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
     d = 1.0 / (fock_Od.reshape(-1, 1) - fock_Vd)
 
     # Initial Amplitudes
-    T1 = jnp.zeros((ndocc,nvir))
-    T2 = D*V[2]
+    T1 = jnp.zeros((ndocc, nvir))
+    T2 = D * V[2]
 
     maxit = options['maxit']
     iteration = 0
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 9f45f0f..8aaf6eb 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -11,24 +11,24 @@ def perturbative_triples(T1, T2, V, fock_Od, fock_Vd):
     delta_o = jnp.eye(o)
     delta_v = jnp.eye(v)
 
-    def inner_func(i,j,k):
-        delta_ij = delta_o[i,j] 
-        delta_jk = delta_o[j,k] 
-        W  = jnp.einsum('dab,cd', Vovvv[i,:,:,:], T2[k,j,:,:]) 
-        W += jnp.einsum('dac,bd', Vovvv[i,:,:,:], T2[j,k,:,:]) 
-        W += jnp.einsum('dca,bd', Vovvv[k,:,:,:], T2[j,i,:,:])  
-        W += jnp.einsum('dcb,ad', Vovvv[k,:,:,:], T2[i,j,:,:])
-        W += jnp.einsum('dbc,ad', Vovvv[j,:,:,:], T2[i,k,:,:])
-        W += jnp.einsum('dba,cd', Vovvv[j,:,:,:], T2[k,i,:,:])
-        W -= jnp.einsum('lc,lab', Vooov[:,k,j,:], T2[i,:,:,:])
-        W -= jnp.einsum('lb,lac', Vooov[:,j,k,:], T2[i,:,:,:]) 
-        W -= jnp.einsum('lb,lca', Vooov[:,j,i,:], T2[k,:,:,:])
-        W -= jnp.einsum('la,lcb', Vooov[:,i,j,:], T2[k,:,:,:])
-        W -= jnp.einsum('la,lbc', Vooov[:,i,k,:], T2[j,:,:,:])
-        W -= jnp.einsum('lc,lba', Vooov[:,k,i,:], T2[j,:,:,:])
-        V  = W + jnp.einsum('bc,a', Voovv[j,k,:,:], T1[i,:]) \
-               + jnp.einsum('ac,b', Voovv[i,k,:,:], T1[j,:]) \
-               + jnp.einsum('ab,c', Voovv[i,j,:,:], T1[k,:])
+    def inner_func(i, j, k):
+        delta_ij = delta_o[i, j]
+        delta_jk = delta_o[j, k]
+        W  = jnp.einsum('dab,cd', Vovvv[i, :, :, :], T2[k, j, :, :])
+        W += jnp.einsum('dac,bd', Vovvv[i, :, :, :], T2[j, k, :, :])
+        W += jnp.einsum('dca,bd', Vovvv[k, :, :, :], T2[j, i, :, :])
+        W += jnp.einsum('dcb,ad', Vovvv[k, :, :, :], T2[i, j, :, :])
+        W += jnp.einsum('dbc,ad', Vovvv[j, :, :, :], T2[i, k, :, :])
+        W += jnp.einsum('dba,cd', Vovvv[j, :, :, :], T2[k, i, :, :])
+        W -= jnp.einsum('lc,lab', Vooov[:, k, j, :], T2[i, :, :, :])
+        W -= jnp.einsum('lb,lac', Vooov[:, j, k, :], T2[i, :, :, :])
+        W -= jnp.einsum('lb,lca', Vooov[:, j, i, :], T2[k, :, :, :])
+        W -= jnp.einsum('la,lcb', Vooov[:, i, j, :], T2[k, :, :, :])
+        W -= jnp.einsum('la,lbc', Vooov[:, i, k, :], T2[j, :, :, :])
+        W -= jnp.einsum('lc,lba', Vooov[:, k, i, :], T2[j, :, :, :])
+        V  = W + jnp.einsum('bc,a', Voovv[j, k, :, :], T1[i, :]) \
+               + jnp.einsum('ac,b', Voovv[i, k, :, :], T1[j, :]) \
+               + jnp.einsum('ab,c', Voovv[i, j, :, :], T1[k, :])
 
 
         delta_occ = 2 - delta_ij - delta_jk
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 7456fcb..5949cd7 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -23,23 +23,26 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_oei_disk(geom, basis_set, basis_set, xyz_path, deriv_order)
-        check = check_tei_disk(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order)
+        check_oei = check_oei_disk(geom, basis_set, basis_set, xyz_path, deriv_order)
+        check_tei = check_tei_disk(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order)
 
         oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
         tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, options, 'disk')
         # If disk integral derivs are right, nothing to do
-        if check:
+        if check_oei:
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
-            G = tei_obj.eri(geom)
         else:
             libint_interface.oei_deriv_disk(deriv_order)
-            libint_interface.eri_deriv_disk(deriv_order)
             S = oei_obj.overlap(geom)
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
+
+        if check_tei:
+            G = tei_obj.eri(geom)
+        else:
+            libint_interface.eri_deriv_disk(deriv_order)
             G = tei_obj.eri(geom)
 
     else:
@@ -117,16 +120,16 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
         else:
             match int_type:
                 case "f12":
-                    libint_interface.f12_deriv_disk(deriv_order)
+                    libint_interface.f12_deriv_disk(beta, deriv_order)
                     F = tei_obj.f12(geom, beta)
                 case "f12_squared":
-                    libint_interface.f12_squared_deriv_disk(deriv_order)
+                    libint_interface.f12_squared_deriv_disk(beta, deriv_order)
                     F = tei_obj.f12_squared(geom, beta)
                 case "f12g12":
-                    libint_interface.f12g12_deriv_disk(deriv_order)
+                    libint_interface.f12g12_deriv_disk(beta, deriv_order)
                     F = tei_obj.f12g12(geom, beta)
                 case "f12_double_commutator":
-                    libint_interface.f12_double_commutator_deriv_disk(deriv_order)
+                    libint_interface.f12_double_commutator_deriv_disk(beta, deriv_order)
                     F = tei_obj.f12_double_commutator(geom, beta)
                 case "eri":
                     libint_interface.eri_deriv_disk(deriv_order)
@@ -165,7 +168,7 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         # Check if there are `deriv_order` datasets in the eri file
-        correct_deriv_order = len(oeifile) == deriv_order
+        correct_deriv_order = len(oeifile) >= 3 * (deriv_order)
         # Check nbf dimension of integral arrays
         sample_dataset_name = list(oeifile.keys())[0]
         correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
@@ -212,7 +215,7 @@ def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
         nbf3 = basis3.nbf()
         nbf4 = basis4.nbf()
         # Check if there are `deriv_order` datasets in the eri file
-        correct_deriv_order = len(erifile) == deriv_order
+        correct_deriv_order = len(erifile) >= deriv_order
         # Check nbf dimension of integral arrays
         sample_dataset_name = list(erifile.keys())[0]
         correct_nbf1 = erifile[sample_dataset_name].shape[0] == nbf1
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index e24a2e2..5bcb663 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -14,7 +14,6 @@ def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options,
 
     print("Running MP2 Computation...")
     nvirt = G.shape[0] - ndocc
-    nbf = G.shape[0]
 
     G = partial_tei_transformation(G, C[:,:ndocc], C[:,ndocc:], C[:,:ndocc], C[:,ndocc:])
 
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 0a0bbe1..5a47b13 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -6,7 +6,7 @@
 import sys
 jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
 
-from .ints import compute_f12_oeints, compute_f12_teints # F12 TEINTS are entered in Chem and returned in Phys
+from .ints import compute_f12_oeints, compute_f12_teints
 from .energy_utils import partial_tei_transformation, f12_transpose
 from .mp2 import restricted_mp2
 
@@ -14,12 +14,11 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
     E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
-    e_ij = eps[:ndocc]
-    e_ab = eps[ndocc:]
+    eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
     cabs_set = cabs_space.basisset()
-    C_cabs = jnp.array(cabs_space.C().to_array())
+    C_cabs = jnp.asarray(cabs_space.C().to_array())
     nobs = C_obs.shape[0]
     nri = C_cabs.shape[0]
 
@@ -33,33 +32,72 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
 
     B = form_B(geom, basis_set, cabs_set, C_obs, C_cabs, f, fk, k, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    D = -1.0 / (e_ij.reshape(-1, 1, 1, 1) + e_ij.reshape(-1, 1, 1) - e_ab.reshape(-1, 1) - e_ab)
+    D = -1.0 * jnp.reciprocal(eps_occ.reshape(-1, 1, 1, 1) + eps_occ.reshape(-1, 1, 1) - eps_vir.reshape(-1, 1) - eps_vir)
 
     G = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G = partial_tei_transformation(G, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_obs[:, ndocc:nobs])
     
-    E_f12 = form_energy(V, X, C, B, D, f, G, ndocc, nobs)
+    indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
-    return E_f12
+    def loop_energy(idx, f12_corr):
+        i,j = indices[idx]
+        kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
+
+        V_ij = V[i, j, :, :]
+        V_ij = V_ij.at[:, :].add(-1.0 *jnp.einsum('klab,ab,ab->kl', C, G[i, j, :, :], D[i, j, :, :], optimize='optimal'))
+
+        V_s = 0.5 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
+
+        V_t = 0.5 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
+                                               * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
+
+        B_ij = B - (X * (f[i, i] + f[j, j]))
+        B_ij = B_ij.at[:, :, :, :].add(-1.0 * jnp.einsum('klab,ab,mnab', C, D[i, j, :, :], C, optimize='optimal'))
+
+        B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
+                     * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
+                     * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
+
+        B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
+                                                 * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
+                                                 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
+                                                 lambda: 0.0)
+
+        f12_corr += kd * (V_s + B_s)
+        f12_corr += 3.0 * kd * (V_t + B_t)
+
+        return f12_corr
+
+    dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
+
+    return E_mp2 + dE_mp2f12
+
+# Fixed Amplitude Ansatz
+@jax.jit
+def t_(p = 0, q = 0, r = 0, s = 0):
+    return jnp.select(
+        [(p == q) & (p == r) & (p ==s), (p == r) & (q == s), (p == s) & (q == r)],
+        [0.5, 0.375, 0.125],
+        default = jnp.nan
+    )
+
+# One-Electron Integrals
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
-    h = jnp.empty((nri, nri))
+    h = np.empty((nri, nri))
 
     h_tmp = compute_f12_oeints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
-    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
+    h_tmp = np.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
     h = h.at[:nobs, :nobs].set(h_tmp) # <O|O>
-    del h_tmp
 
     h_tmp = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
-    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
+    h_tmp = np.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
     h = h.at[:nobs, nobs:nri].set(h_tmp) # <O|C>
     h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp)) # <C|O>
-    del h_tmp
 
     h_tmp = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options)
     h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_cabs, C_cabs, h_tmp, optimize='optimal')
     h = h.at[nobs:nri, nobs:nri].set(h_tmp) # <C|C>
-    del h_tmp
 
     return h
 
@@ -73,112 +111,31 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_obs, C_obs[:, :ndocc], C_obs, C_obs)
     G = G.at[:nobs, :ndocc, :nobs, :nobs].set(G_tmp) # <Oo|OO>
-    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs, C_obs, C_obs)
     G = G.at[nobs:nri, :nobs, :nobs, :nobs].set(G_tmp) # <CO|OO>
     G = G.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,1,0,3))) # <OO|CO>
     G = G.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
-    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_obs, C_cabs)
     G = G.at[nobs:nri, :ndocc, :nobs, nobs:nri].set(G_tmp) # <Co|OC>
-    del G_tmp
 
     G_tmp = compute_f12_teints(geom, cabs_set, cabs_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_cabs, C_obs)
     G = G.at[nobs:nri, :ndocc, nobs:nri, :nobs].set(G_tmp) # <Co|CO>
-    del G_tmp
 
     # Fill Fock Matrix
     f = f.at[:, :].add(2.0 * jnp.einsum('piqi->pq', G[:, :ndocc, :, :ndocc], optimize='optimal'))
     fk = f # Fock Matrix without Exchange
     k =  jnp.einsum('piiq->pq', G[:, :ndocc, :ndocc, :], optimize='optimal')
     f = f.at[:, :].add(-1.0 * k)
-    del G
 
     return f, fk, k
 
-# F12 Energy and Energy (Tilde) Intermediates
-def kron_delta(i, j):
-    if i == j:
-        return 1.0
-    else:
-        return 2.0
-
-def form_energy(V, X, C, B, D, Fock, G, ndocc, nobs):
-    # Singlet and Triplet Pair Energies
-    E_f12_s = 0.0
-    E_f12_t = 0.0
-
-    for i in range(ndocc):
-        for j in range(i, ndocc):
-            B_ij = B - (X * (Fock[i, i] + Fock[j, j]))
-            V_s, V_t = form_V_Tilde(V[i, j, :, :], C, G[i, j, :, :], D[i, j, :, :], i, j)
-            B_s, B_t = form_B_Tilde(B_ij, C, D[i, j, :, :], i, j)
-
-            kd = kron_delta(i, j)
-
-            E_s = kd * (V_s + B_s)
-            E_f12_s += E_s
-
-            E_t = 0.0
-            if i != j:
-                E_t = 3.0 * kd * (V_t + B_t)
-                E_f12_t += E_t
-
-    return E_f12_s + E_f12_t
-
-def t_(p, q, r, s):
-    # Fixed Amplitude Ansatz
-    if p == r and q == s and p != q:
-        return 3.0 / 8.0
-    elif q == r and p == s and p != q:
-        return 1.0 / 8.0
-    elif p == q and p == r and p == s:
-        return 0.5
-    else:
-        return 0.0
-
-def form_V_Tilde(V_ij, C, G_ij, D_ij, i, j):
-    # Singlet and Triplet Pair Energies
-    V_s = 0.0
-    V_t = 0.0
-
-    V_ij = V_ij.at[:, :].add(-1.0 *jnp.einsum('klab,ab,ab->kl', C, G_ij, D_ij, optimize='optimal'))
-
-    kd = kron_delta(i, j)
-
-    V_s += 0.5 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
-
-    if i != j:
-        V_t += 0.5 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd * (V_ij[i, j] - V_ij[j, i])
-
-    return V_s, V_t
-
-def form_B_Tilde(B_ij, C, D_ij, i, j):
-    # Singlet and Triplet Pair Energies
-    B_s = 0.0
-    B_t = 0.0
-
-    B_ij = B_ij.at[:, :, :, :].add(-1.0 * jnp.einsum('klab,ab,mnab', C, D_ij, C, optimize='optimal'))
-
-    kd = kron_delta(i, j)
-
-    B_s += 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
-                 * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
-                 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
-    
-    if i != j:
-        B_t += 0.125 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd \
-                     * (B_ij[i, j, i, j] - B_ij[j, i, i, j]) \
-                     * (t_(i, j, i, j) - t_(i, j, j, i)) * kd
-        
-    return B_s, B_t
-
 # F12 Intermediates
+# F12 TEINTS are entered in Chem and returned in Phys
 
 def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
 
@@ -192,17 +149,12 @@ def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deri
     V_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', G_tmp, F_tmp, optimize='optimal')
     V = V.at[:, :, :, :].add(V_tmp)
     V = V.at[:, :, :, :].add(f12_transpose(V_tmp))
-    del V_tmp
-    del F_tmp
-    del G_tmp
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
     V = V.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', G_tmp, F_tmp, optimize='optimal'))
-    del F_tmp
-    del G_tmp
 
     return V
 
@@ -216,13 +168,10 @@ def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deri
     X_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', F_tmp, F_tmp, optimize='optimal')
     X = X.at[:, :, :, :].add(X_tmp)
     X = X.at[:, :, :, :].add(f12_transpose(X_tmp))
-    del X_tmp
-    del F_tmp
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
     X = X.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', F_tmp, F_tmp, optimize='optimal'))
-    del F_tmp
 
     return X
 
@@ -233,11 +182,9 @@ def form_C(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, ndocc, nobs, xyz_path
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_cabs)
     C_tmp = jnp.einsum('klay,by->klab', F_tmp, Fock[ndocc:nobs, nobs:], optimize='optimal')
-    del F_tmp
 
     C = C.at[:, :, :, :].set(C_tmp)
     C = C.at[:, :, :, :].add(f12_transpose(C_tmp))
-    del C_tmp
 
     return C
 
@@ -248,30 +195,25 @@ def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs,
 
     # Term 2
     F2 = jnp.empty((ndocc, ndocc, ndocc, nri))
-
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
-    del F_tmp
+    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
+    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
 
     tmp = jnp.einsum('lknI,mI->lknm', F2, noK[:ndocc, :])
-    del F2
     B = B.at[:, :, :, :].add(tmp)
     B = B.at[:, :, :, :].add(f12_transpose(tmp))
-    del tmp
 
     # F12 Integral
     F_oo11 = jnp.empty((ndocc, ndocc, nri, nri))
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
-    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(F_tmp) # <oo|OC>
-    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(F_tmp)) # <oo|CO>
-    F_tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
-    del F_tmp
+    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
+    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    tmp = partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
+    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(tmp) # <oo|OC>
+    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(tmp)) # <oo|CO>
+    tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
 
     # Term 3
     tmp = -1.0 * jnp.einsum('lkPC,CA,nmPA->lknm', F_oo11, K, F_oo11, optimize='optimal')
@@ -300,12 +242,10 @@ def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs,
 
     # Term 8
     tmp = -2.0 * jnp.einsum('lkbq,qy,nmby->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, nobs:], F_oo11[:, :, ndocc:nobs, nobs:], optimize='optimal')
-    del F_oo11
     B = B.at[:, :, :, :].add(tmp)
     B = B.at[:, :, :, :].add(f12_transpose(tmp))
 
     tmp = jnp.transpose(B, (2,3,0,1))
     B = B.at[:, :, :, :].add(tmp)
-    del tmp
 
     return 0.5 * B

From 25fc8e62967a29fa6ff83dc821e033358889d9a3 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 27 Oct 2023 14:25:38 -0400
Subject: [PATCH 23/91] MP2-F12 bad JAX call

---
 quax/methods/mp2f12.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 5a47b13..08b2820 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -84,14 +84,14 @@ def t_(p = 0, q = 0, r = 0, s = 0):
 # One-Electron Integrals
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
-    h = np.empty((nri, nri))
+    h = jnp.empty((nri, nri))
 
     h_tmp = compute_f12_oeints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
-    h_tmp = np.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
+    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
     h = h.at[:nobs, :nobs].set(h_tmp) # <O|O>
 
     h_tmp = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
-    h_tmp = np.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
+    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
     h = h.at[:nobs, nobs:nri].set(h_tmp) # <O|C>
     h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp)) # <C|O>
 

From f4670de72fe9a380a0b8198b808ccbbd11d2b5b1 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 27 Oct 2023 19:03:11 -0400
Subject: [PATCH 24/91] Fix F12^2 Disk maxes

---
 quax/integrals/libint_interface.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 62a6300..8b586b2 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -2421,8 +2421,6 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
 
         // Libint engine for computing shell quartet derivatives
         std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-        size_t max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-        int max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
         cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
         cgtg_squared_engines[0].set_params(cgtg_params);
         for (size_t i = 1; i != nthreads; ++i) {

From 5b8cf18358b13b2514378bb8fb58ed781074b82c Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 30 Oct 2023 13:35:45 -0400
Subject: [PATCH 25/91] Remove pre-allocated lookup arrays, fix primals

---
 quax/integrals/buffer_lookups.h    | 153 -------
 quax/integrals/libint_interface.cc | 695 ++++++++++-------------------
 quax/integrals/tei.py              |   8 +-
 3 files changed, 247 insertions(+), 609 deletions(-)
 delete mode 100644 quax/integrals/buffer_lookups.h

diff --git a/quax/integrals/buffer_lookups.h b/quax/integrals/buffer_lookups.h
deleted file mode 100644
index 529d326..0000000
--- a/quax/integrals/buffer_lookups.h
+++ /dev/null
@@ -1,153 +0,0 @@
-// These functions, generate_*_lookup, create the buffer index lookup arrays.
-// When given a set of indices which represent a Shell derivative operator, e.g. 0,0 == d/dx1 d/dx1, 0,1 = d/dx1 d/dx2, etc
-// these arrays, when indexed with those indices, give the flattened buffer index according to the order these shell derivatives
-// are packed into a Libint integral Engine buffer.
-// These arrays are always the same for finding the shell derivative mapping for overlap, kinetic, and ERI for a given derivative order.
-// These are also used for nuclear derivatives of nuclear attraction integrals,
-// which vary in size dynamically due to the presence of additional nuclear derivatives
-
-std::vector<int> generate_1d_lookup(int dim_size) { 
-    std::vector<int> lookup(dim_size, 0);
-    for (int i = 0; i < dim_size; i++){
-        lookup[i] = i; 
-    }
-    return lookup;
-}
-
-std::vector<std::vector<int>> generate_2d_lookup(int dim_size) { 
-    using namespace std;
-    vector<vector<int>> lookup(dim_size, vector<int> (dim_size, 0));
-    vector<vector<int>> combos; // always the same, list of lists
-
-    // Collect multidimensional indices corresponding to generalized upper triangle
-    for (int i = 0; i < dim_size; i++) {
-      for (int j = i; j < dim_size; j++) {
-        vector<int> tmp = {i, j};
-        combos.push_back(tmp);
-      }
-    }
-    // Build lookup array and return
-    for (int i = 0; i < combos.size(); i++){
-        auto multi_idx = combos[i];
-        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
-        do { 
-        lookup[multi_idx[0]][multi_idx[1]] = i; 
-        } 
-        while (next_permutation(multi_idx.begin(),multi_idx.end())); 
-    }
-    return lookup;
-}
-
-std::vector<std::vector<std::vector<int>>> generate_3d_lookup(int dim_size) { 
-    using namespace std;
-    vector<vector<vector<int>>> lookup(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size)));
-    vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle
-    for (int i = 0; i < dim_size; i++) {
-      for (int j = i; j < dim_size; j++) {
-        for (int k = j; k < dim_size; k++) {
-          vector<int> tmp = {i, j, k};
-          combos.push_back(tmp);
-        }
-      }
-    }
-    // Build lookup array and return
-    for (int i = 0; i < combos.size(); i++){
-        auto multi_idx = combos[i];
-        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
-        do { 
-        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]] = i; 
-        } 
-        while (next_permutation(multi_idx.begin(),multi_idx.end())); 
-    }
-    return lookup;
-}
-
-std::vector<std::vector<std::vector<std::vector<int>>>> generate_4d_lookup(int dim_size) { 
-    using namespace std;
-    vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
-    vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle
-    for (int i = 0; i < dim_size; i++) {
-      for (int j = i; j < dim_size; j++) {
-        for (int k = j; k < dim_size; k++) {
-          for (int l = k; l < dim_size; l++) {
-            vector<int> tmp = {i, j, k, l};
-            combos.push_back(tmp);
-          }
-        }
-      }
-    }
-    // Build lookup array and return
-    for (int i = 0; i < combos.size(); i++){
-        auto multi_idx = combos[i];
-        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
-        do { 
-        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]][multi_idx[3]] = i; 
-        } 
-        while (next_permutation(multi_idx.begin(),multi_idx.end())); 
-    }
-    return lookup;
-}
-
-/*
-std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> generate_5d_lookup(int dim_size) {
-    using namespace std;
-    vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
-    vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle
-    for (int i = 0; i < dim_size; i++) {
-      for (int j = i; j < dim_size; j++) {
-        for (int k = j; k < dim_size; k++) {
-          for (int l = k; l < dim_size; l++) {
-            for (int m = l; m < dim_size; m++) {
-                vector<int> tmp = {i, j, k, l, m};
-                combos.push_back(tmp);
-            }
-          }
-        }
-      }
-    }
-    // Build lookup array and return
-    for (int i = 0; i < combos.size(); i++){
-        auto multi_idx = combos[i];
-        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
-        do {
-        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]][multi_idx[3]][multi_idx[4]] = i;
-        }
-        while (next_permutation(multi_idx.begin(),multi_idx.end()));
-    }
-    return lookup;
-}
-
-std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> generate_6d_lookup(int dim_size) {
-    using namespace std;
-    vector<vector<vector<vector<int>>>> lookup(dim_size, vector<vector<vector<int>>>(dim_size, vector<vector<int>>(dim_size, vector<int>(dim_size))));
-    vector<vector<int>> combos; // always the same, list of lists
-    // Collect multidimensional indices corresponding to generalized upper triangle
-    for (int i = 0; i < dim_size; i++) {
-      for (int j = i; j < dim_size; j++) {
-        for (int k = j; k < dim_size; k++) {
-          for (int l = k; l < dim_size; l++) {
-            for (int m = l; m < dim_size; m++) {
-              for (int n = m; n < dim_size; n++) {
-                vector<int> tmp = {i, j, k, l, m, n};
-                combos.push_back(tmp);
-              }
-            }
-          }
-        }
-      }
-    }
-    // Build lookup array and return
-    for (int i = 0; i < combos.size(); i++){
-        auto multi_idx = combos[i];
-        // Loop over all permutations, assign 1d buffer index to appropriate addresses in totally symmetric lookup array
-        do {
-        lookup[multi_idx[0]][multi_idx[1]][multi_idx[2]][multi_idx[3]][multi_idx[4]][multi_idx[5]] = i;
-        }
-        while (next_permutation(multi_idx.begin(),multi_idx.end()));
-    }
-    return lookup;
-}
-*/
\ No newline at end of file
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 8b586b2..3c984a3 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -11,8 +11,6 @@
 #include <pybind11/stl.h>
 #include <libint2.hpp>
 
-#include "buffer_lookups.h"
-
 // TODO support spherical harmonic gaussians, implement symmetry considerations, support 5th, 6th derivs
 
 namespace py = pybind11;
@@ -32,25 +30,6 @@ size_t max_nprim;
 int max_l;
 int nthreads = 1;
 
-// These lookup arrays are for mapping Libint's computed shell-set integrals and integral derivatives to the proper index 
-// in the full OEI/TEI array or derivative array.
-// ERI,overlap,kinetic buffer lookup arrays are always the same, create at compile time.
-// Potential buffer lookups have to be created at runtime since they are dependent on natoms
-// Total size of these is (12 + 12^2 + 12^3 + 12^4 + 6 + 6^2 + 6^3 + 6^4) * 2 bytes = 48 kB 
-// Note quintic, sextics will likely require long int, probably a different algo.
-static const std::vector<int> buffer_index_eri1d = generate_1d_lookup(12);
-static const std::vector<std::vector<int>> buffer_index_eri2d = generate_2d_lookup(12);
-static const std::vector<std::vector<std::vector<int>>> buffer_index_eri3d = generate_3d_lookup(12);
-static const std::vector<std::vector<std::vector<std::vector<int>>>> buffer_index_eri4d = generate_4d_lookup(12);
-//static const std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> buffer_index_eri5d = generate_5d_lookup(12);
-//static const std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> buffer_index_eri6d = generate_6d_lookup(12);
-static const std::vector<int> buffer_index_oei1d = generate_1d_lookup(6);
-static const std::vector<std::vector<int>> buffer_index_oei2d = generate_2d_lookup(6);
-static const std::vector<std::vector<std::vector<int>>> buffer_index_oei3d = generate_3d_lookup(6);
-static const std::vector<std::vector<std::vector<std::vector<int>>>> buffer_index_oei4d = generate_4d_lookup(6);
-//static const std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>> buffer_index_oei5d = generate_5d_lookup(6);
-//static const std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>> buffer_index_oei6d = generate_6d_lookup(6);
-
 // Creates atom objects from xyz file path
 std::vector<libint2::Atom> get_atoms(std::string xyzfilename) 
 {
@@ -710,6 +689,9 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
     // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
     std::vector<int> desired_atom_indices;
     std::vector<int> desired_coordinates;
@@ -741,17 +723,29 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            // We can check if EVERY differentiated atom according to deriv_vec is contained in this set of 2 atom indices
-            // This will ensure the derivative we want is in the buffer.
-            std::vector<int> desired_shell_atoms; 
+            // Initialize 2d vector, with DERIV_ORDER subvectors
+            // Each subvector contains index candidates which are possible choices for each partial derivative operator
+            // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
+            // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
+            std::vector<std::vector<int>> indices; 
             for (int i = 0; i < deriv_order; i++){
-                int desired_atom = desired_atom_indices[i];
-                if (shell_atom_index_list[0] == desired_atom) desired_shell_atoms.push_back(0); 
-                else if (shell_atom_index_list[1] == desired_atom) desired_shell_atoms.push_back(1); 
+                std::vector<int> new_vec;
+                indices.push_back(new_vec);
             }
 
-            // If the length of this vector is not == deriv_order, this shell duet can be skipped, since it does not contain desired derivative
-            if (desired_shell_atoms.size() != deriv_order) continue;
+            // For every desired atom derivative, check both shell centers for a match and add the matching index to the subvector for that derivative
+            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            for (int j = 0; j < desired_atom_indices.size(); j++){
+                int desired_atom_idx = desired_atom_indices[j];
+                // Shell indices
+                for (int i = 0; i < 2; i++){
+                    int atom_idx = shell_atom_index_list[i];
+                    if (atom_idx == desired_atom_idx) { 
+                        int tmp = 3 * i + desired_coordinates[j];
+                        indices[j].push_back(tmp);
+                    }
+                }
+            }
 
             // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
             size_t thread_id = 0;
@@ -761,36 +755,30 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
 
-            // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
-            std::vector<int> shell_derivative;
-            for (int i = 0; i < deriv_order; i++){
-                shell_derivative.push_back(3 * desired_shell_atoms[i] + desired_coordinates[i]);
-            }
+            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+            std::vector<int> buffer_indices;
 
-            // Now we must convert our multidimensional shell_derivative index into a one-dimensional buffer index. 
-            // We know how to do this since libint tells us what order they come in. The lookup arrays above map the multidim index to the buffer idx
-            int buffer_idx;
-            if (deriv_order == 1) { 
-                buffer_idx = buffer_index_oei1d[shell_derivative[0]];
-            }
-            else if (deriv_order == 2) { 
-                buffer_idx = buffer_index_oei2d[shell_derivative[0]][shell_derivative[1]];
-            }
-            else if (deriv_order == 3) { 
-                buffer_idx = buffer_index_oei3d[shell_derivative[0]][shell_derivative[1]][shell_derivative[2]];
-            }
-            else if (deriv_order == 4) { 
-                buffer_idx = buffer_index_oei4d[shell_derivative[0]][shell_derivative[1]][shell_derivative[2]][shell_derivative[3]];
+            // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
             }
 
-            auto ints_shellset = buf_vec[buffer_idx]; // Location of the computed integrals
-            if (ints_shellset == nullptr)
-                continue;  // nullptr returned if the entire shell-set was screened out
-
-            // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf2 + bf2 + f2 ] = ints_shellset[idx];
+            // Loop over every buffer index and accumulate for every shell set.
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto ints_shellset = buf_vec[buffer_indices[i]];
+                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
+                    }
                 }
             }
         }
@@ -804,6 +792,9 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
     // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
     std::vector<int> desired_atom_indices;
     std::vector<int> desired_coordinates;
@@ -834,17 +825,29 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            // We can check if EVERY differentiated atom according to deriv_vec is contained in this set of 2 atom indices
-            // This will ensure the derivative we want is in the buffer.
-            std::vector<int> desired_shell_atoms; 
+            // Initialize 2d vector, with DERIV_ORDER subvectors
+            // Each subvector contains index candidates which are possible choices for each partial derivative operator
+            // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
+            // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
+            std::vector<std::vector<int>> indices; 
             for (int i = 0; i < deriv_order; i++){
-                int desired_atom = desired_atom_indices[i];
-                if (shell_atom_index_list[0] == desired_atom) desired_shell_atoms.push_back(0); 
-                else if (shell_atom_index_list[1] == desired_atom) desired_shell_atoms.push_back(1); 
+                std::vector<int> new_vec;
+                indices.push_back(new_vec);
             }
 
-            // If the length of this vector is not == deriv_order, this shell duet can be skipped, since it does not contain desired derivative
-            if (desired_shell_atoms.size() != deriv_order) continue;
+            // For every desired atom derivative, check both shell centers for a match and add the matching index to the subvector for that derivative
+            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            for (int j = 0; j < desired_atom_indices.size(); j++){
+                int desired_atom_idx = desired_atom_indices[j];
+                // Shell indices
+                for (int i = 0; i < 2; i++){
+                    int atom_idx = shell_atom_index_list[i];
+                    if (atom_idx == desired_atom_idx) { 
+                        int tmp = 3 * i + desired_coordinates[j];
+                        indices[j].push_back(tmp);
+                    }
+                }
+            }
 
             // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
             size_t thread_id = 0;
@@ -854,36 +857,30 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
 
-            // Now convert these shell atom indices into a shell derivative index, a set of indices length deriv_order with values between 0 and 5, corresponding to 6 possible shell center coordinates
-            std::vector<int> shell_derivative;
-            for (int i = 0; i < deriv_order; i++){
-                shell_derivative.push_back(3 * desired_shell_atoms[i] + desired_coordinates[i]);
-            }
+            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+            std::vector<int> buffer_indices;
 
-            // Now we must convert our multidimensional shell_derivative index into a one-dimensional buffer index. 
-            // We know how to do this since libint tells us what order they come in. The lookup arrays above map the multidim index to the buffer idx
-            int buffer_idx;
-            if (deriv_order == 1) { 
-                buffer_idx = buffer_index_oei1d[shell_derivative[0]];
-            }
-            else if (deriv_order == 2) { 
-                buffer_idx = buffer_index_oei2d[shell_derivative[0]][shell_derivative[1]];
-            }
-            else if (deriv_order == 3) { 
-                buffer_idx = buffer_index_oei3d[shell_derivative[0]][shell_derivative[1]][shell_derivative[2]];
+            // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
             }
-            else if (deriv_order == 4) { 
-                buffer_idx = buffer_index_oei4d[shell_derivative[0]][shell_derivative[1]][shell_derivative[2]][shell_derivative[3]];
-            }
-
-            auto ints_shellset = buf_vec[buffer_idx]; // Location of the computed integrals
-            if (ints_shellset == nullptr)
-                continue;  // nullptr returned if the entire shell-set was screened out
 
-            // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
+            // Loop over every buffer index and accumulate for every shell set.
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto ints_shellset = buf_vec[buffer_indices[i]];
+                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
+                    }
                 }
             }
         }
@@ -897,13 +894,9 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
 
-    // Lookup arrays for mapping shell derivative index to buffer index 
-    // Potential lookup arrays depend on atom size
-    int dimensions = 6 + ncart;
-    static const std::vector<int> buffer_index_potential1d = generate_1d_lookup(dimensions);
-    static const std::vector<std::vector<int>> buffer_index_potential2d = generate_2d_lookup(dimensions);
-    static const std::vector<std::vector<std::vector<int>>> buffer_index_potential3d = generate_3d_lookup(dimensions);
-    static const std::vector<std::vector<std::vector<std::vector<int>>>> buffer_index_potential4d = generate_4d_lookup(dimensions);
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
     // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
     std::vector<int> desired_atom_indices;
@@ -967,9 +960,6 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 }
             }
 
-            // Create index combos representing every mixed partial derivative operator which contributes to nuclear derivative
-            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
             // Compute the integrals
             size_t thread_id = 0;
 #ifdef _OPENMP
@@ -978,37 +968,20 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
             v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
             const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
             
-            // Loop over every subvector of index_combos and lookup buffer index.
+            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
             std::vector<int> buffer_indices;
-            if (deriv_order == 1){
-                for (int i = 0; i < index_combos.size(); i++){
-                    int idx1 = index_combos[i][0];
-                    buffer_indices.push_back(buffer_index_potential1d[idx1]);
-                }
-            }
-            else if (deriv_order == 2){
-                for (int i = 0; i < index_combos.size(); i++){
-                    int idx1 = index_combos[i][0];
-                    int idx2 = index_combos[i][1];
-                    buffer_indices.push_back(buffer_index_potential2d[idx1][idx2]);
-                }
-            }
-            else if (deriv_order == 3){
-                for (int i = 0; i < index_combos.size(); i++){
-                    int idx1 = index_combos[i][0];
-                    int idx2 = index_combos[i][1];
-                    int idx3 = index_combos[i][2];
-                    buffer_indices.push_back(buffer_index_potential3d[idx1][idx2][idx3]);
-                }
-            }
-            else if (deriv_order == 4){
-                for (int i = 0; i < index_combos.size(); i++){
-                    int idx1 = index_combos[i][0];
-                    int idx2 = index_combos[i][1];
-                    int idx3 = index_combos[i][2];
-                    int idx4 = index_combos[i][3];
-                    buffer_indices.push_back(buffer_index_potential4d[idx1][idx2][idx3][idx4]);
-                }
+
+            // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
             }
 
             // Loop over every buffer index and accumulate for every shell set.
@@ -1037,6 +1010,9 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
     // ERI derivative integral engine
     std::vector<libint2::Engine> eri_engines(nthreads);
     eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
@@ -1110,38 +1086,16 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     // Now we want all combinations where we pick exactly one index from each subvector.
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
-                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    if (deriv_order == 1){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
-                        }
-                    }
-                    else if (deriv_order == 2){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
-                        }
-                    }
-                    else if (deriv_order == 3){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
-                        }
-                    }
-                    else if (deriv_order == 4){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            int idx4 = index_combos[i][3];
-                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
-                        }
+   
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
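+                        // lower_bound presumes buffer_multidim_lookup is sorted ascending (TODO confirm); buf_idx stays 0 when vec sorts past the last entry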
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
@@ -1187,6 +1141,9 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
 
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_engines(nthreads);
@@ -1262,38 +1219,16 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
                     // Now we want all combinations where we pick exactly one index from each subvector.
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
-                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    if (deriv_order == 1){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
-                        }
-                    }
-                    else if (deriv_order == 2){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
-                        }
-                    }
-                    else if (deriv_order == 3){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
-                        }
-                    }
-                    else if (deriv_order == 4){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            int idx4 = index_combos[i][3];
-                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
-                        }
+                    
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
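+                        // lower_bound presumes buffer_multidim_lookup is sorted ascending (TODO confirm); buf_idx stays 0 when vec sorts past the last entry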
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
@@ -1304,7 +1239,7 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
                     cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = cgtg_engines[thread_id].results(); // will point to computed shell sets
 
-                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -1339,6 +1274,9 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
 
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
     // F12 Squared derivative integral engine
     auto cgtg_params = take_square(make_cgtg(beta));
     std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
@@ -1414,38 +1352,16 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
                     // Now we want all combinations where we pick exactly one index from each subvector.
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
-                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    if (deriv_order == 1){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
-                        }
-                    }
-                    else if (deriv_order == 2){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
-                        }
-                    }
-                    else if (deriv_order == 3){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
-                        }
-                    }
-                    else if (deriv_order == 4){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            int idx4 = index_combos[i][3];
-                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
-                        }
+                    
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
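+                        // lower_bound presumes buffer_multidim_lookup is sorted ascending (TODO confirm); buf_idx stays 0 when vec sorts past the last entry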
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
@@ -1491,6 +1407,9 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
 
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
@@ -1566,38 +1485,16 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
                     // Now we want all combinations where we pick exactly one index from each subvector.
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
-                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    if (deriv_order == 1){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
-                        }
-                    }
-                    else if (deriv_order == 2){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
-                        }
-                    }
-                    else if (deriv_order == 3){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
-                        }
-                    }
-                    else if (deriv_order == 4){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            int idx4 = index_combos[i][3];
-                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
-                        }
+                    
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
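+                        // lower_bound presumes buffer_multidim_lookup is sorted ascending (TODO confirm); buf_idx stays 0 when vec sorts past the last entry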
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
@@ -1643,6 +1540,9 @@ py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
 
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
 
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
     // F12 derivative integral engine
     auto cgtg_params = make_cgtg(beta);
     std::vector<libint2::Engine> cgtg_del_engines(nthreads);
@@ -1718,38 +1618,16 @@ py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
                     // Now we want all combinations where we pick exactly one index from each subvector.
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-
-                    // Now create buffer_indices from these index combos using lookup array
                     std::vector<int> buffer_indices;
-                    if (deriv_order == 1){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            buffer_indices.push_back(buffer_index_eri1d[idx1]);
-                        }
-                    }
-                    else if (deriv_order == 2){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            buffer_indices.push_back(buffer_index_eri2d[idx1][idx2]);
-                        }
-                    }
-                    else if (deriv_order == 3){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            buffer_indices.push_back(buffer_index_eri3d[idx1][idx2][idx3]);
-                        }
-                    }
-                    else if (deriv_order == 4){ 
-                        for (int i = 0; i < index_combos.size(); i++){
-                            int idx1 = index_combos[i][0];
-                            int idx2 = index_combos[i][1];
-                            int idx3 = index_combos[i][2];
-                            int idx4 = index_combos[i][3];
-                            buffer_indices.push_back(buffer_index_eri4d[idx1][idx2][idx3][idx4]);
-                        }
+
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        // buffer_multidim_lookup
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
@@ -1760,7 +1638,7 @@ py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
                     cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
 
-                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -2048,8 +1926,7 @@ void eri_deriv_disk(int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -2130,22 +2007,15 @@ void eri_deriv_disk(int max_deriv_order) {
                             // This is achievable through a cartesian product 
                             std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                             std::vector<int> buffer_indices;
-                            
+
                             // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                             for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
@@ -2230,8 +2100,7 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -2315,20 +2184,13 @@ void f12_deriv_disk(double beta, int max_deriv_order) {
                             std::vector<int> buffer_indices;
                             
                             // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                             for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
@@ -2413,8 +2275,7 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -2498,20 +2359,13 @@ void f12_squared_deriv_disk(double beta, int max_deriv_order) {
                             std::vector<int> buffer_indices;
                             
                             // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                             for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
@@ -2596,8 +2450,7 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -2681,20 +2534,13 @@ void f12g12_deriv_disk(double beta, int max_deriv_order) {
                             std::vector<int> buffer_indices;
                             
                             // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                             for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
@@ -2779,8 +2625,7 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
         // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Currently not used due to predefined lookup arrays
-        //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
         // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -2864,20 +2709,13 @@ void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) {
                             std::vector<int> buffer_indices;
                             
                             // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            //for (auto vec : index_combos)  {
-                            //    std::sort(vec.begin(), vec.end());
-                            //    int buf_idx = 0;
-                            //    // buffer_multidim_lookup
-                            //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            //    buffer_indices.push_back(buf_idx);
-                            //}
-                            // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                             for (auto vec : index_combos)  {
-                                if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                                else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                                else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                                else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
                             }
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
@@ -2938,8 +2776,7 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
     // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
     // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
     // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
     const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
@@ -3031,18 +2868,12 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                 std::vector<int> buffer_indices;
                 std::vector<int> potential_buffer_indices;
                 // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                //for (auto vec : index_combos)  {
-                //    std::sort(vec.begin(), vec.end());
-                //    int buf_idx = 0;
-                //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                //    buffer_indices.push_back(buf_idx);
-                //}
                 for (auto vec : index_combos)  {
-                    if (deriv_order == 1) buffer_indices.push_back(buffer_index_oei1d[vec[0]]);
-                    else if (deriv_order == 2) buffer_indices.push_back(buffer_index_oei2d[vec[0]][vec[1]]);
-                    else if (deriv_order == 3) buffer_indices.push_back(buffer_index_oei3d[vec[0]][vec[1]][vec[2]]);
-                    else if (deriv_order == 4) buffer_indices.push_back(buffer_index_oei4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
                 }
                 // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
                 for (auto vec : potential_index_combos)  {
@@ -3088,8 +2919,7 @@ py::array eri_deriv_core(int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -3162,20 +2992,13 @@ py::array eri_deriv_core(int deriv_order) {
                         std::vector<int> buffer_indices;
                         
                         // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        //for (auto vec : index_combos)  {
-                        //    std::sort(vec.begin(), vec.end());
-                        //    int buf_idx = 0;
-                        //    // buffer_multidim_lookup
-                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        //    buffer_indices.push_back(buf_idx);
-                        //}
-                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                         for (auto vec : index_combos)  {
-                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            std::sort(vec.begin(), vec.end());
+                            int buf_idx = 0;
+                            // buffer_multidim_lookup
+                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            buffer_indices.push_back(buf_idx);
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
@@ -3212,8 +3035,7 @@ py::array f12_deriv_core(double beta, int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -3288,20 +3110,13 @@ py::array f12_deriv_core(double beta, int deriv_order) {
                         std::vector<int> buffer_indices;
                         
                         // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        //for (auto vec : index_combos)  {
-                        //    std::sort(vec.begin(), vec.end());
-                        //    int buf_idx = 0;
-                        //    // buffer_multidim_lookup
-                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        //    buffer_indices.push_back(buf_idx);
-                        //}
-                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                         for (auto vec : index_combos)  {
-                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            std::sort(vec.begin(), vec.end());
+                            int buf_idx = 0;
+                            // buffer_multidim_lookup
+                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            buffer_indices.push_back(buf_idx);
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
@@ -3338,8 +3153,7 @@ py::array f12_squared_deriv_core(double beta, int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -3414,20 +3228,13 @@ py::array f12_squared_deriv_core(double beta, int deriv_order) {
                         std::vector<int> buffer_indices;
                         
                         // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        //for (auto vec : index_combos)  {
-                        //    std::sort(vec.begin(), vec.end());
-                        //    int buf_idx = 0;
-                        //    // buffer_multidim_lookup
-                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        //    buffer_indices.push_back(buf_idx);
-                        //}
-                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                         for (auto vec : index_combos)  {
-                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            std::sort(vec.begin(), vec.end());
+                            int buf_idx = 0;
+                            // buffer_multidim_lookup
+                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            buffer_indices.push_back(buf_idx);
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
@@ -3464,8 +3271,7 @@ py::array f12g12_deriv_core(double beta, int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -3540,20 +3346,13 @@ py::array f12g12_deriv_core(double beta, int deriv_order) {
                         std::vector<int> buffer_indices;
                         
                         // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        //for (auto vec : index_combos)  {
-                        //    std::sort(vec.begin(), vec.end());
-                        //    int buf_idx = 0;
-                        //    // buffer_multidim_lookup
-                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        //    buffer_indices.push_back(buf_idx);
-                        //}
-                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                         for (auto vec : index_combos)  {
-                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            std::sort(vec.begin(), vec.end());
+                            int buf_idx = 0;
+                            // buffer_multidim_lookup
+                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            buffer_indices.push_back(buf_idx);
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
@@ -3590,8 +3389,7 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
     // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Currently unused due to predefined lookup arrays
-    //const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
@@ -3666,20 +3464,13 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
                         std::vector<int> buffer_indices;
                         
                         // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        //for (auto vec : index_combos)  {
-                        //    std::sort(vec.begin(), vec.end());
-                        //    int buf_idx = 0;
-                        //    // buffer_multidim_lookup
-                        //    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        //    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        //    buffer_indices.push_back(buf_idx);
-                        //}
-                        // Eventually, if you stop using lookup arrays, use above implementation, but these are sitting around so might as well use them 
                         for (auto vec : index_combos)  {
-                            if (deriv_order == 1) buffer_indices.push_back(buffer_index_eri1d[vec[0]]);
-                            else if (deriv_order == 2) buffer_indices.push_back(buffer_index_eri2d[vec[0]][vec[1]]);
-                            else if (deriv_order == 3) buffer_indices.push_back(buffer_index_eri3d[vec[0]][vec[1]][vec[2]]);
-                            else if (deriv_order == 4) buffer_indices.push_back(buffer_index_eri4d[vec[0]][vec[1]][vec[2]][vec[3]]);
+                            std::sort(vec.begin(), vec.end());
+                            int buf_idx = 0;
+                            // buffer_multidim_lookup
+                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                            buffer_indices.push_back(buf_idx);
                         }
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index bf97660..05bf410 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -332,7 +332,7 @@ def eri_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_jvp(self, primals, tangents):
-        geom, beta, = primals
+        geom, beta = primals
         primals_out = self.f12(geom, beta)
         tangents_out = self.f12_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -346,7 +346,7 @@ def f12_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_squared_jvp(self, primals, tangents):
-        geom, beta, = primals
+        geom, beta = primals
         primals_out = self.f12_squared(geom, beta)
         tangents_out = self.f12_squared_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -360,7 +360,7 @@ def f12_squared_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12g12_jvp(self, primals, tangents):
-        geom, beta, = primals
+        geom, beta = primals
         primals_out = self.f12g12(geom, beta)
         tangents_out = self.f12g12_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out
@@ -374,7 +374,7 @@ def f12g12_deriv_jvp(self, primals, tangents):
         return primals_out, tangents_out
 
     def f12_double_commutator_jvp(self, primals, tangents):
-        geom, beta, = primals
+        geom, beta = primals
         primals_out = self.f12_double_commutator(geom, beta)
         tangents_out = self.f12_double_commutator_deriv(geom, beta, tangents[0])
         return primals_out, tangents_out

From 84df1c3edfbb7063703445452055c658727a6c8a Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 7 Nov 2023 11:00:23 -0500
Subject: [PATCH 26/91] Standardize creation of indices

---
 quax/integrals/libint_interface.cc | 156 ++++++++---------------------
 1 file changed, 42 insertions(+), 114 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 3c984a3..f64f388 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -723,18 +723,9 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            // Initialize 2d vector, with DERIV_ORDER subvectors
-            // Each subvector contains index candidates which are possible choices for each partial derivative operator
-            // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-            // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-            std::vector<std::vector<int>> indices; 
-            for (int i = 0; i < deriv_order; i++){
-                std::vector<int> new_vec;
-                indices.push_back(new_vec);
-            }
-
             // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
             // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
             for (int j = 0; j < desired_atom_indices.size(); j++){
                 int desired_atom_idx = desired_atom_indices[j];
                 // Shell indices
@@ -747,14 +738,6 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
                 }
             }
 
-            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
-
             // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
             // and the total number of subvectors is the order of differentiation
             // Now we want all combinations where we pick exactly one index from each subvector.
@@ -771,6 +754,14 @@ py::array overlap_deriv(std::vector<int> deriv_vec) {
                 buffer_indices.push_back(buf_idx);
             }
 
+            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
+
             // Loop over every buffer index and accumulate for every shell set.
             for(auto i = 0; i < buffer_indices.size(); ++i) {
                 auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -825,38 +816,21 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            // Initialize 2d vector, with DERIV_ORDER subvectors
-            // Each subvector contains index candidates which are possible choices for each partial derivative operator
-            // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-            // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-            std::vector<std::vector<int>> indices; 
-            for (int i = 0; i < deriv_order; i++){
-                std::vector<int> new_vec;
-                indices.push_back(new_vec);
-            }
-
             // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
             // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
             for (int j = 0; j < desired_atom_indices.size(); j++){
                 int desired_atom_idx = desired_atom_indices[j];
                 // Shell indices
                 for (int i = 0; i < 2; i++){
                     int atom_idx = shell_atom_index_list[i];
-                    if (atom_idx == desired_atom_idx) { 
+                    if (atom_idx == desired_atom_idx) {
                         int tmp = 3 * i + desired_coordinates[j];
                         indices[j].push_back(tmp);
                     }
                 }
             }
 
-            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
-
             // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
             // and the total number of subvectors is the order of differentiation
             // Now we want all combinations where we pick exactly one index from each subvector.
@@ -873,6 +847,14 @@ py::array kinetic_deriv(std::vector<int> deriv_vec) {
                 buffer_indices.push_back(buf_idx);
             }
 
+            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once.
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
+
             // Loop over every buffer index and accumulate for every shell set.
             for(auto i = 0; i < buffer_indices.size(); ++i) {
                 auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -928,18 +910,9 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
-            // Initialize 2d vector, with DERIV_ORDER subvectors
-            // Each subvector contains index candidates which are possible choices for each partial derivative operator
-            // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-            // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-            std::vector<std::vector<int>> indices; 
-            for (int i = 0; i < deriv_order; i++){
-                std::vector<int> new_vec;
-                indices.push_back(new_vec);
-            }
-
             // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
             // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
             for (int j = 0; j < desired_atom_indices.size(); j++){
                 int desired_atom_idx = desired_atom_indices[j];
                 // Shell indices
@@ -954,19 +927,11 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 for (int i = 0; i < natom; i++){
                     // i = shell_atom_index_list[i];
                     if (i == desired_atom_idx) {
-                        int tmp = 3 * (i +2) + desired_coordinates[j];
+                        int tmp = 3 * (i + 2) + desired_coordinates[j];
                         indices[j].push_back(tmp);
                     }
                 }
             }
-
-            // Compute the integrals
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
             
             // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
             // and the total number of subvectors is the order of differentiation
@@ -984,6 +949,14 @@ py::array potential_deriv(std::vector<int> deriv_vec) {
                 buffer_indices.push_back(buf_idx);
             }
 
+            // Compute the integrals
+            size_t thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
+
             // Loop over every buffer index and accumulate for every shell set.
             for(auto i = 0; i < buffer_indices.size(); ++i) {
                 auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -1056,19 +1029,10 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    // Initialize 2d vector, with DERIV_ORDER subvectors
-                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
-                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-                    std::vector<std::vector<int>> indices;
-                    for (int i = 0; i < deriv_order; i++){
-                        std::vector<int> new_vec;
-                        indices.push_back(new_vec);
-                    }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices
@@ -1087,7 +1051,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                     std::vector<int> buffer_indices;
-   
+
                     // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
                     for (auto vec : index_combos)  {
                         std::sort(vec.begin(), vec.end());
@@ -1106,7 +1070,7 @@ py::array eri_deriv(std::vector<int> deriv_vec) {
                     eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
-                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -1189,19 +1153,10 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    // Initialize 2d vector, with DERIV_ORDER subvectors
-                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
-                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-                    std::vector<std::vector<int>> indices;
-                    for (int i = 0; i < deriv_order; i++){
-                        std::vector<int> new_vec;
-                        indices.push_back(new_vec);
-                    }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices
@@ -1220,7 +1175,7 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                     std::vector<int> buffer_indices;
-                    
+
                     // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
                     for (auto vec : index_combos)  {
                         std::sort(vec.begin(), vec.end());
@@ -1241,7 +1196,7 @@ py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
 
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        if (ints_shellset == nullptr) continue;
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                             size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                             for(auto f2 = 0; f2 != n2; ++f2) {
@@ -1322,19 +1277,10 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    // Initialize 2d vector, with DERIV_ORDER subvectors
-                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
-                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-                    std::vector<std::vector<int>> indices;
-                    for (int i = 0; i < deriv_order; i++){
-                        std::vector<int> new_vec;
-                        indices.push_back(new_vec);
-                    }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices
@@ -1353,7 +1299,7 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                     std::vector<int> buffer_indices;
-                    
+
                     // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
                     for (auto vec : index_combos)  {
                         std::sort(vec.begin(), vec.end());
@@ -1372,7 +1318,7 @@ py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
                     cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
 
-                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -1455,19 +1401,10 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    // Initialize 2d vector, with DERIV_ORDER subvectors
-                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
-                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-                    std::vector<std::vector<int>> indices;
-                    for (int i = 0; i < deriv_order; i++){
-                        std::vector<int> new_vec;
-                        indices.push_back(new_vec);
-                    }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices
@@ -1486,7 +1423,7 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
                     // This is achievable through a cartesian product 
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
                     std::vector<int> buffer_indices;
-                    
+
                     // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
                     for (auto vec : index_combos)  {
                         std::sort(vec.begin(), vec.end());
@@ -1505,7 +1442,7 @@ py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
                     cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
                     const auto& buf_vec = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
 
-                    for(auto i = 0; i<buffer_indices.size(); ++i) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -1588,19 +1525,10 @@ py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
 
                     // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    // Initialize 2d vector, with DERIV_ORDER subvectors
-                    // Each subvector contains index candidates which are possible choices for each partial derivative operator
-                    // In other words, indices looks like { {choices for first deriv operator} {choices for second deriv op} {third} ...}
-                    // The cartesian product of these subvectors gives all combos that need to be summed to form total nuclear derivative of integrals
-                    std::vector<std::vector<int>> indices;
-                    for (int i = 0; i < deriv_order; i++){
-                        std::vector<int> new_vec;
-                        indices.push_back(new_vec);
-                    }
                 
                     // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
                     // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
                     for (int j = 0; j < desired_atom_indices.size(); j++){
                         int desired_atom_idx = desired_atom_indices[j];
                         // Shell indices

From 8db0cfbecbad6d3430a39e4aea0f872736242311 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Tue, 7 Nov 2023 17:00:35 -0500
Subject: [PATCH 27/91] Remove extraneous calls, don't use .at[] for full-tensor updates

---
 quax/integrals/oei.py  |  6 ---
 quax/integrals/tei.py  | 10 -----
 quax/methods/mp2f12.py | 95 +++++++++++++++++-------------------------
 3 files changed, 39 insertions(+), 72 deletions(-)

diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index ec30695..64de62b 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -17,12 +17,6 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         natoms = molecule.natom()
 
-        # Libint and Psi4 CABS naming
-        if 'cabs' in basis1.name().lower():
-            basis1_name = basis1.name().lower().replace('cabs', 'optri')
-        if 'cabs' in basis2.name().lower():
-            basis2_name = basis2.name().lower().replace('cabs', 'optri')
-
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
 
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 05bf410..148f06c 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -17,16 +17,6 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, op
         molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         natoms = molecule.natom()
 
-        # Libint and Psi4 CABS naming
-        if 'cabs' in basis1.name().lower():
-            basis1_name = basis1.name().lower().replace('cabs', 'optri')
-        if 'cabs' in basis2.name().lower():
-            basis2_name = basis2.name().lower().replace('cabs', 'optri')
-        if 'cabs' in basis3.name().lower():
-            basis3_name = basis3.name().lower().replace('cabs', 'optri')
-        if 'cabs' in basis4.name().lower():
-            basis4_name = basis4.name().lower().replace('cabs', 'optri')
-
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         nbf3 = basis3.nbf()
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 08b2820..09e4b50 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -40,11 +40,11 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
     def loop_energy(idx, f12_corr):
-        i,j = indices[idx]
+        i, j = indices[idx]
         kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
 
         V_ij = V[i, j, :, :]
-        V_ij = V_ij.at[:, :].add(-1.0 *jnp.einsum('klab,ab,ab->kl', C, G[i, j, :, :], D[i, j, :, :], optimize='optimal'))
+        V_ij -= jnp.einsum('klab,ab,ab->kl', C, G[i, j, :, :], D[i, j, :, :], optimize='optimal')
 
         V_s = 0.5 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
@@ -52,7 +52,7 @@ def loop_energy(idx, f12_corr):
                                                * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
         B_ij = B - (X * (f[i, i] + f[j, j]))
-        B_ij = B_ij.at[:, :, :, :].add(-1.0 * jnp.einsum('klab,ab,mnab', C, D[i, j, :, :], C, optimize='optimal'))
+        B_ij -= jnp.einsum('klab,ab,mnab', C, D[i, j, :, :], C, optimize='optimal')
 
         B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
                      * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
@@ -74,7 +74,7 @@ def loop_energy(idx, f12_corr):
 
 # Fixed Amplitude Ansatz
 @jax.jit
-def t_(p = 0, q = 0, r = 0, s = 0):
+def t_(p, q, r, s):
     return jnp.select(
         [(p == q) & (p == r) & (p ==s), (p == r) & (q == s), (p == s) & (q == r)],
         [0.5, 0.375, 0.125],
@@ -127,10 +127,10 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
     G = G.at[nobs:nri, :ndocc, nobs:nri, :nobs].set(G_tmp) # <Co|CO>
 
     # Fill Fock Matrix
-    f = f.at[:, :].add(2.0 * jnp.einsum('piqi->pq', G[:, :ndocc, :, :ndocc], optimize='optimal'))
+    f += 2.0 * jnp.einsum('piqi->pq', G[:, :ndocc, :, :ndocc], optimize='optimal')
     fk = f # Fock Matrix without Exchange
     k =  jnp.einsum('piiq->pq', G[:, :ndocc, :ndocc, :], optimize='optimal')
-    f = f.at[:, :].add(-1.0 * k)
+    f -= k
 
     return f, fk, k
 
@@ -146,15 +146,14 @@ def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deri
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
-    V_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', G_tmp, F_tmp, optimize='optimal')
-    V = V.at[:, :, :, :].add(V_tmp)
-    V = V.at[:, :, :, :].add(f12_transpose(V_tmp))
+    V_tmp = jnp.einsum('ijmy,klmy->ijkl', G_tmp, F_tmp, optimize='optimal')
+    V -= V_tmp + f12_transpose(V_tmp)
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
     G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
     G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
-    V = V.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', G_tmp, F_tmp, optimize='optimal'))
+    V -= jnp.einsum('ijrs,klrs->ijkl', G_tmp, F_tmp, optimize='optimal')
 
     return V
 
@@ -165,28 +164,22 @@ def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deri
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
-    X_tmp = -1.0 * jnp.einsum('ijmy,klmy->ijkl', F_tmp, F_tmp, optimize='optimal')
-    X = X.at[:, :, :, :].add(X_tmp)
-    X = X.at[:, :, :, :].add(f12_transpose(X_tmp))
+    X_tmp = jnp.einsum('ijmy,klmy->ijkl', F_tmp, F_tmp, optimize='optimal')
+    X -= X_tmp + f12_transpose(X_tmp)
 
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
-    X = X.at[:, :, :, :].add(-1.0 * jnp.einsum('ijrs,klrs->ijkl', F_tmp, F_tmp, optimize='optimal'))
+    X -= jnp.einsum('ijrs,klrs->ijkl', F_tmp, F_tmp, optimize='optimal')
 
     return X
 
 def form_C(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, ndocc, nobs, xyz_path, deriv_order, options):
 
-    C = jnp.empty((ndocc, ndocc, nobs - ndocc, nobs - ndocc))
-
     F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
     F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_cabs)
-    C_tmp = jnp.einsum('klay,by->klab', F_tmp, Fock[ndocc:nobs, nobs:], optimize='optimal')
-
-    C = C.at[:, :, :, :].set(C_tmp)
-    C = C.at[:, :, :, :].add(f12_transpose(C_tmp))
+    tmp = jnp.einsum('klay,by->klab', F_tmp, Fock[ndocc:nobs, nobs:], optimize='optimal')
 
-    return C
+    return tmp + f12_transpose(tmp)
 
 def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs, nri, xyz_path, deriv_order, options):
     # Term 1
@@ -195,57 +188,47 @@ def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs,
 
     # Term 2
     F2 = jnp.empty((ndocc, ndocc, ndocc, nri))
-    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
-    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
+    F2_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(F2_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
+    F2_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
+    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(F2_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
 
     tmp = jnp.einsum('lknI,mI->lknm', F2, noK[:ndocc, :])
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    B += tmp + f12_transpose(tmp)
 
     # F12 Integral
     F_oo11 = jnp.empty((ndocc, ndocc, nri, nri))
-    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
-    tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    tmp = partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
-    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(tmp) # <oo|OC>
-    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(tmp)) # <oo|CO>
-    tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
+    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
+    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(F_tmp) # <oo|OC>
+    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(F_tmp)) # <oo|CO>
+    F_tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
+    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
 
     # Term 3
-    tmp = -1.0 * jnp.einsum('lkPC,CA,nmPA->lknm', F_oo11, K, F_oo11, optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    tmp = jnp.einsum('lkPC,CA,nmPA->lknm', F_oo11, K, F_oo11, optimize='optimal')
+    B -= tmp + f12_transpose(tmp)
 
     # Term 4
-    tmp = -1.0 * jnp.einsum('lkjC,CA,nmjA->lknm', F_oo11[:, :, :ndocc, :], Fock, F_oo11[:, :, :ndocc, :], optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    tmp = jnp.einsum('lkjC,CA,nmjA->lknm', F_oo11[:, :, :ndocc, :], Fock, F_oo11[:, :, :ndocc, :], optimize='optimal')
+    B -= tmp + f12_transpose(tmp)
 
     # Term 5
     tmp = jnp.einsum('lkxj,ji,nmxi->lknm', F_oo11[:, :, nobs:, :ndocc], Fock[:ndocc, :ndocc], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    B += tmp + f12_transpose(tmp)
 
     # Term 6
-    tmp = -1.0 * jnp.einsum('lkbp,pq,nmbq->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, :nobs], F_oo11[:, :, ndocc:nobs, :nobs], optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    tmp = jnp.einsum('lkbp,pq,nmbq->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, :nobs], F_oo11[:, :, ndocc:nobs, :nobs], optimize='optimal')
+    B -= tmp + f12_transpose(tmp)
 
     # Term 7
-    tmp = -2.0 * jnp.einsum('lkxI,jI,nmxj->lknm', F_oo11[:, :, nobs:, :], Fock[:ndocc, :], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
+    tmp = 2.0 * jnp.einsum('lkxI,Ij,nmxj->lknm', F_oo11[:, :, nobs:, :], Fock[:, :ndocc], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
+    B -= tmp + f12_transpose(tmp)
 
     # Term 8
-    tmp = -2.0 * jnp.einsum('lkbq,qy,nmby->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, nobs:], F_oo11[:, :, ndocc:nobs, nobs:], optimize='optimal')
-    B = B.at[:, :, :, :].add(tmp)
-    B = B.at[:, :, :, :].add(f12_transpose(tmp))
-
-    tmp = jnp.transpose(B, (2,3,0,1))
-    B = B.at[:, :, :, :].add(tmp)
+    tmp = 2.0 * jnp.einsum('lkbq,qy,nmby->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, nobs:], F_oo11[:, :, ndocc:nobs, nobs:], optimize='optimal')
+    B -= tmp + f12_transpose(tmp)
 
-    return 0.5 * B
+    return 0.5 * (B + jnp.transpose(B, (2,3,0,1)))

From 440b1be0321380d05a0ac2bffe7a4b3f42f0cc20 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Tue, 28 Nov 2023 13:31:29 -0500
Subject: [PATCH 28/91] CABS progress

---
 quax/core.py                       |  15 +-
 quax/integrals/basis_utils.py      |  82 +++++--
 quax/integrals/libint_interface.cc |  15 +-
 quax/methods/energy_utils.py       |  40 ++--
 quax/methods/ints.py               |  91 +++++---
 quax/methods/mp2f12.py             | 343 +++++++++++++++++------------
 6 files changed, 345 insertions(+), 241 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index d0b13bd..2a47801 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -8,8 +8,7 @@
 import os
 import h5py
 
-from .integrals.basis_utils import build_CABS
-from .methods.energy_utils import nuclear_repulsion, cholesky_orthogonalization
+from .integrals.basis_utils import build_RIBS
 from .methods.hartree_fock import restricted_hartree_fock
 from .methods.mp2 import restricted_mp2
 from .methods.mp2f12 import restricted_mp2_f12
@@ -84,12 +83,8 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     natoms = molecule.natom()
     print("Number of basis functions: ", nbf)
 
-    if 'f12' in method: # Ensure use of Dunning basis sets
-        try:
-            cabs_name = basis_name + "-cabs"
-            cabs_space = build_CABS(molecule, basis_name, cabs_name)
-        except:
-            raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
+    if 'f12' in method:
+        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
 
     # Energy and full derivative tensor evaluations
     args = (geom, basis_set, xyz_path, nuclear_charges, charge, options)
@@ -102,7 +97,7 @@ def electronic_energy(*args, deriv_order=deriv_order):
             def electronic_energy(*args, deriv_order=deriv_order):
                 return restricted_mp2(*args, deriv_order=deriv_order)
         elif method =='mp2-f12':
-            args = args + (cabs_space,)
+            args += (cabs_set,)
             def electronic_energy(*args, deriv_order=deriv_order):
                 return restricted_mp2_f12(*args, deriv_order=deriv_order)
         elif method =='ccsd':
@@ -164,7 +159,7 @@ def partial_wrapper(*args):
         elif method =='mp2-f12':
             def partial_wrapper(*args):
                 geom = jnp.asarray(args)
-                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=deriv_order)
+                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=deriv_order)
                 return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 9801105..fafcbe9 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -1,16 +1,23 @@
-import psi4 
+import psi4
+import jax
 import jax.numpy as jnp
-import numpy as np
+from jax.lax import fori_loop
 
-def build_CABS(molecule, basis_name, cabs_name):
+from ..methods.ints import compute_f12_oeints
+from ..methods.energy_utils import symmetric_orthogonalization
+
+def build_RIBS(molecule, basis_set, cabs_name):
     """
-    Builds and returns CABS
-    Provide molecule from Psi4,
-    OBS name, CABS name, and
-    MO coefficients from RHF
+    Builds the combined basis set (orbital basis plus
+    its CABS counterpart) used by the CABS procedure
     """
+
     # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
-    psi4_name = cabs_name.lower().replace('cabs', 'optri')
+    basis_name = basis_set.name()
+    try:
+        psi4_name = cabs_name.lower().replace('cabs', 'optri')
+    except:
+        raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
 
     keys = ["BASIS","CABS_BASIS"]
     targets = [basis_name, psi4_name]
@@ -18,29 +25,56 @@ def build_CABS(molecule, basis_name, cabs_name):
     others = [basis_name, basis_name]
 
     # Creates combined basis set in Python
-    obs = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     ao_union = psi4.driver.qcdb.libmintsbasisset.BasisSet.pyconstruct_combined(molecule.save_string_xyz(), keys, targets, roles, others)
     ao_union['name'] = cabs_name
-    ao_union = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
-    ri_space = psi4.core.OrbitalSpace.build_ri_space(ao_union, 1.0e-8)
+    ribs_set = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
+
+    return ribs_set
+
+def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
+    """
+    Builds and returns 
+    CABS transformation matrix
+    """
 
-    C_ribs = np.array(ri_space.C()) # Orthogonalizes the AOs of the RI space
+    # Orthogonalize combined basis set
+    S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
+    C_ribs = symmetric_orthogonalization(S_ao_ribs_ribs, 1.0e-8)
 
     # Compute the overlap matrix between OBS and RIBS, then orthogonalizes the RIBS
-    mints = psi4.core.MintsHelper(obs)
-    S_ao_obs_ribs = np.array(mints.ao_overlap(obs, ri_space.basisset()))
-    C12 = np.einsum('Pq,qQ->PQ', S_ao_obs_ribs, C_ribs)
+    S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
+    C12 = jnp.dot(S_ao_obs_ribs, C_ribs)
 
-    # Compute the eigenvectors and eigenvalues of S12.T * S12
-    _, S, Vt = np.linalg.svd(C12)
+    nN, Vt = null_svd(C12)
+
+    V_N = jnp.transpose(Vt[nN:, :])
+
+    C_cabs = jnp.dot(C_ribs, V_N)
+
+    return C_cabs
+
+@jax.custom_jvp
+def null_svd(C12, cutoff = 1.0e-6):
+    """
+    Runs an SVD of C12 and returns nN (the number of singular
+    values plus the count of those below cutoff), used as the row
+    offset for slicing null vectors out of Vt, and the full Vt matrix
+    """
+    # Compute the eigenvectors and eigenvalues of C12.T @ C12
+    _, S, Vt = jnp.linalg.svd(C12)
 
     # Collect the eigenvectors that are associated with (near) zero eignevalues
-    ncabs = S.shape[0]
-    for eval_i in S:
-        if abs(eval_i) < 1.0e-6: ncabs += 1
-    V_N = Vt[ncabs:, :].T
+    def loop_zero_vals(idx, count):
+        count += jax.lax.cond(abs(S[idx]) < cutoff, lambda: 1, lambda: 0)
+        return count
+    nN = fori_loop(0, S.shape[0], loop_zero_vals, S.shape[0])
 
-    # Make sure the CABS is an orthonormal set
-    C_cabs = psi4.core.Matrix.from_array(np.einsum('pQ,QP->pP', C_ribs, V_N))
+    return nN, Vt
 
-    return psi4.core.OrbitalSpace(ri_space.id(), cabs_name, C_cabs, ri_space.basisset(), ri_space.integral())
+@null_svd.defjvp
+def null_svd_jvp(primals, tangents):
+  C12, cutoff = primals
+  C12_dot, cutoff_dot = tangents
+  primal_out = null_svd(C12, cutoff)
+  tangent_out = null_svd(C12_dot, cutoff)
+  return primal_out, tangent_out
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index f64f388..201ae26 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -2940,8 +2940,7 @@ py::array eri_deriv_core(int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += eri_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += eri_shellset[idx];
                                         }
                                     }
                                 }
@@ -3058,8 +3057,7 @@ py::array f12_deriv_core(double beta, int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_shellset[idx];
                                         }
                                     }
                                 }
@@ -3176,8 +3174,7 @@ py::array f12_squared_deriv_core(double beta, int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_squared_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_squared_shellset[idx];
                                         }
                                     }
                                 }
@@ -3294,8 +3291,7 @@ py::array f12g12_deriv_core(double beta, int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12g12_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12g12_shellset[idx];
                                         }
                                     }
                                 }
@@ -3412,8 +3408,7 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            size_t offset_4 = bf4 + f4;
-                                            result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] += f12_double_commutator_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_double_commutator_shellset[idx];
                                         }
                                     }
                                 }
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 61df17c..39a1119 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -14,18 +14,24 @@ def nuclear_repulsion(geom, nuclear_charges):
             nuc += nuclear_charges[i] * nuclear_charges[j] / jnp.linalg.norm(geom[i] - geom[j])
     return nuc
 
-def symmetric_orthogonalization(S):
+def symmetric_orthogonalization(S, cutoff = 1.0e-12):
     """
     Compute the symmetric orthogonalization transform U = S^(-1/2)
     where S is the overlap matrix
     """
-    # Warning: Higher order derivatives for some larger basis sets (TZ on) give NaNs for this algo 
-    eigval, eigvec = jnp.linalg.eigh(S)
-    cutoff = 1.0e-12
-    above_cutoff = (abs(eigval) > cutoff * jnp.max(abs(eigval)))
-    val = 1 / jnp.sqrt(eigval[above_cutoff])
-    vec = eigvec[:, above_cutoff]
-    A = vec.dot(jnp.diag(val)).dot(vec.T)
+    evals, evecs = jnp.linalg.eigh(S)
+
+    def loop_evals(idx, M):
+        val = jax.lax.cond(abs(evals[idx]) > cutoff,
+                           lambda: jnp.reciprocal(jnp.sqrt(evals[idx])),
+                           lambda: 0.0)
+        
+        M = M.at[idx, idx].set(val)
+        return M
+    
+    sqrtm = jax.lax.fori_loop(0, evals.shape[0], loop_evals, jnp.zeros(S.shape))
+
+    A = jnp.dot(evecs, jnp.dot(sqrtm, jnp.transpose(evecs)))
     return A
 
 def cholesky_orthogonalization(S):
@@ -64,23 +70,15 @@ def old_partial_tei_transformation(G, Ci, Cj, Ck, Cl):
     G = jnp.einsum('pqrs, pP, qQ, rR, sS -> PQRS', G, Ci, Cj, Ck, Cl, optimize='optimal')
     return G
 
-def partial_tei_transformation(G, C1, C2, C3, C4):
+def partial_tei_transformation(G, Ci, Cj, Ck, Cl):
     """
     New algo for Partial TEI transform
     """
-    G = transform(C4, G)
-    G = transform(C3, G)
-    G = transform(C2, G)
-    G = transform(C1, G)
+    G = transform(Cl, G)
+    G = transform(Ck, G)
+    G = transform(Cj, G)
+    G = transform(Ci, G)
     return G
-
-@jax.jit
-def chem2phys(G):
-    return jnp.transpose(G, (0,2,1,3))
-
-@jax.jit
-def f12_transpose(G):
-    return jnp.transpose(G, (1,0,3,2))
     
 def cartesian_product(*arrays):
     '''
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 5949cd7..3f2f4b9 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -7,8 +7,6 @@
 import psi4
 import os
 
-from .energy_utils import chem2phys
-
 # Check for Libint interface
 from ..integrals import TEI
 from ..integrals import OEI
@@ -58,36 +56,59 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
     libint_interface.finalize()
     return S, T, V, G
 
-def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options):
+def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
     basis1_name = basis1.name()
     basis2_name = basis2.name()
     libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name)
 
-    if algo == 'libint_disk':
-        # Check disk for currently existing integral derivatives
-        check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
+    if cabs:
+        if algo == 'libint_disk':
+            # Check disk for currently existing integral derivatives
+            check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
+    
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
+            # If disk integral derivs are right, nothing to do
+            if check:
+                S = oei_obj.overlap(geom)
+            else:
+                libint_interface.oei_deriv_disk(deriv_order)
+                S = oei_obj.overlap(geom)
 
-        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12_disk')
-        # If disk integral derivs are right, nothing to do
-        if check:
-            T = oei_obj.kinetic(geom)
-            V = oei_obj.potential(geom)
         else:
-            libint_interface.oei_deriv_disk(deriv_order)
-            T = oei_obj.kinetic(geom)
-            V = oei_obj.potential(geom)
+            # Precompute OEI derivatives
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+            # Compute integrals
+            S = oei_obj.overlap(geom)
+        
+        libint_interface.finalize()
+        return S
 
     else:
-        # Precompute TEI derivatives
-        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12_core')
-        # Compute integrals
-        T = oei_obj.kinetic(geom)
-        V = oei_obj.potential(geom)
+        if algo == 'libint_disk':
+            # Check disk for currently existing integral derivatives
+            check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
 
-    libint_interface.finalize()
-    return T + V
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
+            # If disk integral derivs are right, nothing to do
+            if check:
+                T = oei_obj.kinetic(geom)
+                V = oei_obj.potential(geom)
+            else:
+                libint_interface.oei_deriv_disk(deriv_order)
+                T = oei_obj.kinetic(geom)
+                V = oei_obj.potential(geom)
+
+        else:
+            # Precompute OEI derivatives
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+            # Compute integrals
+            T = oei_obj.kinetic(geom)
+            V = oei_obj.potential(geom)
+        
+        libint_interface.finalize()
+        return T, V
 
 def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk
@@ -103,7 +124,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
         # Check disk for currently existing integral derivatives
         check = check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
 
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'f12_disk')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'disk')
         # If disk integral derivs are right, nothing to do
         if check:
             match int_type:
@@ -137,7 +158,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'f12_core')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'core')
         # Compute integrals
         match int_type:
             case "f12":
@@ -152,7 +173,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
                 F = tei_obj.eri(geom)
 
     libint_interface.finalize()
-    return chem2phys(F)
+    return F
 
 def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
@@ -176,7 +197,11 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         oeifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
 
-    # TODO flesh out this logic for determining if partials file contains all integrals needed
+    if correct_int_derivs:
+        print("Integral derivatives appear to be correct. Avoiding recomputation.")
+    return correct_int_derivs
+
+"""     # TODO flesh out this logic for determining if partials file contains all integrals needed
     # for particular address
     elif (os.path.exists("oei_partials.h5")):
         print("Found currently existing partial oei derivatives in working directory. Assuming they are correct.")
@@ -193,11 +218,7 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
         correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
         oeifile.close()
-        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
-
-    if correct_int_derivs:
-        print("Integral derivatives appear to be correct. Avoiding recomputation.")
-    return correct_int_derivs
+        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 """
 
 def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, address=None):
     # TODO need to check geometry and basis set name in addition to nbf
@@ -224,11 +245,12 @@ def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
         correct_nbf4 = erifile[sample_dataset_name].shape[3] == nbf4
         erifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 and correct_nbf3 and correct_nbf4
-        if correct_int_derivs:
-            print("Integral derivatives appear to be correct. Avoiding recomputation.")
-        return correct_int_derivs
+    
+    if correct_int_derivs:
+        print("Integral derivatives appear to be correct. Avoiding recomputation.")
+    return correct_int_derivs
 
-    # TODO flesh out this logic for determining if partials file contains all integrals needed
+"""     # TODO flesh out this logic for determining if partials file contains all integrals needed
     # for particular address
     elif ((os.path.exists("eri_partials.h5"))):
         print("Found currently existing partial tei derivatives in working directory. Assuming they are correct.")
@@ -250,3 +272,4 @@ def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
         if correct_int_derivs:
             print("Integral derivatives appear to be correct. Avoiding recomputation.")
         return correct_int_derivs
+ """
\ No newline at end of file
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 09e4b50..f220ca7 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -6,229 +6,288 @@
 import sys
 jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
 
+from ..integrals.basis_utils import build_CABS
 from .ints import compute_f12_oeints, compute_f12_teints
-from .energy_utils import partial_tei_transformation, f12_transpose
+from .energy_utils import partial_tei_transformation
 from .mp2 import restricted_mp2
 
-def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_space, deriv_order=0):
+def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=0):
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
     E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
-    eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
-    cabs_set = cabs_space.basisset()
-    C_cabs = jnp.asarray(cabs_space.C().to_array())
+    C_cabs = build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
+
     nobs = C_obs.shape[0]
     nri = C_cabs.shape[0]
+	
+    o, v, p, c, A = slice(0, ndocc), slice(ndocc, nobs), slice(0, nobs), slice(nobs, nri), slice(0, nri)
+
+    eps_occ, eps_vir = eps[o], eps[v]
 
-    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    # Fock
+    h = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
+    G = form_G(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
+    f, fk, k = form_Fock(h, (G[A, o, A, o], G[A, o, o, A]))
 
-    V = form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+    # V Intermediate
+    FG = form_FG(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options)
+    F = form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    V = form_V(FG, (F[o, o, o, c], F[o, o, p, p]), (G[o, o, o, c], G[o, o, p, p]))
 
-    X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+    # X Intermediate
+    F2 = form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    X = form_X(F2[o, o, o, o], (F[o, o, o, c], F[o, o, p, p]))
 
-    C = form_C(geom, basis_set, cabs_set, C_obs, C_cabs, f, ndocc, nobs, xyz_path, deriv_order, options)
+    # C Intermediate
+    C = form_C(F[o, o, v, c], f[v, c])
 
-    B = form_B(geom, basis_set, cabs_set, C_obs, C_cabs, f, fk, k, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    # B Intermediate
+    Uf = form_Uf(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options)
+    B = form_B(Uf, F2, (F, F[o, o, o, A], F[o, o, c, o], F[o, o, v, p], F[o, o, c, A], F[o, o, v, c]),\
+               (f, f[o, o], f[p, p], f[A, o], f[p, c]), fk[o, A], k)
 
     D = -1.0 * jnp.reciprocal(eps_occ.reshape(-1, 1, 1, 1) + eps_occ.reshape(-1, 1, 1) - eps_vir.reshape(-1, 1) - eps_vir)
+    Dv = slice(0, nobs - ndocc)
+    
+    # indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
-    G = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G = partial_tei_transformation(G, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_obs[:, ndocc:nobs])
+    # def loop_energy(idx, f12_corr):
+        # i, j = indices[idx]
     
-    indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
+    dE_mp2f12 = 0.0
+    for i in range(ndocc):
+        for j in range(i, ndocc):
+            kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
+
+            D_ij = D[i, j, Dv, Dv]
 
-    def loop_energy(idx, f12_corr):
-        i, j = indices[idx]
-        kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
+            V_ij = V[i, j, o, o]
+            GD_ij = G[i, j, v, v] * D_ij
+            V_ij -= jnp.tensordot(C, GD_ij, [(2, 3), (0, 1)])
+            print(V_ij)
 
-        V_ij = V[i, j, :, :]
-        V_ij -= jnp.einsum('klab,ab,ab->kl', C, G[i, j, :, :], D[i, j, :, :], optimize='optimal')
+            V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
-        V_s = 0.5 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
+            V_t = 0.25 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
+                                                   * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
-        V_t = 0.5 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
-                                               * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
+            B_ij = B - (X * (f[i, i] + f[j, j]))
+            CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
+            B_ij -= jnp.tensordot(C, CD_ij, [(2, 3), (2, 3)])
+            print(B_ij)
 
-        B_ij = B - (X * (f[i, i] + f[j, j]))
-        B_ij -= jnp.einsum('klab,ab,mnab', C, D[i, j, :, :], C, optimize='optimal')
+            B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
+                         * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
+                         * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
 
-        B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
-                     * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
-                     * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
+            B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
+                                                     * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
+                                                     * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
+                                                     lambda: 0.0)
 
-        B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
-                                                 * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
-                                                 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
-                                                 lambda: 0.0)
+            dE_mp2f12 += kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
+            dE_mp2f12 += 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
 
-        f12_corr += kd * (V_s + B_s)
-        f12_corr += 3.0 * kd * (V_t + B_t)
+    #     return f12_corr
 
-        return f12_corr
+    # dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
 
-    dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
+    jax.debug.print("OG: {e}", e=dE_mp2f12)
 
-    return E_mp2 + dE_mp2f12
+    return dE_mp2f12
 
 # Fixed Amplitude Ansatz
 @jax.jit
 def t_(p, q, r, s):
     return jnp.select(
-        [(p == q) & (p == r) & (p ==s), (p == r) & (q == s), (p == s) & (q == r)],
+        [(p == q) & (p == r) & (p == s), (p == r) & (q == s), (p == s) & (q == r)],
         [0.5, 0.375, 0.125],
         default = jnp.nan
     )
 
 # One-Electron Integrals
+def one_body_mo_computer(geom, bs1, bs2, C1, C2, xyz_path, deriv_order, options):
+    """
+    General one-body MO computer
+    that computes the AOs and 
+    transforms to MOs
+    """
+    T, V = compute_f12_oeints(geom, bs1, bs2, xyz_path, deriv_order, options, False)
+    AO = T + V
+    MO = jnp.dot(C1.T, jnp.dot(AO, C2))
+    return MO
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
-    h = jnp.empty((nri, nri))
+    tv = jnp.empty((nri, nri))
 
-    h_tmp = compute_f12_oeints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
-    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_obs, h_tmp, optimize='optimal')
-    h = h.at[:nobs, :nobs].set(h_tmp) # <O|O>
+    mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, xyz_path, deriv_order, options)
+    tv = tv.at[:nobs, :nobs].set(mo1) # <O|O>
 
-    h_tmp = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
-    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_obs, C_cabs, h_tmp, optimize='optimal')
-    h = h.at[:nobs, nobs:nri].set(h_tmp) # <O|C>
-    h = h.at[nobs:nri, :nobs].set(jnp.transpose(h_tmp)) # <C|O>
+    mo2 = one_body_mo_computer(geom, basis_set, cabs_set, C_obs, C_cabs, xyz_path, deriv_order, options)
+    tv = tv.at[:nobs, nobs:nri].set(mo2) # <O|C>
+    tv = tv.at[nobs:nri, :nobs].set(mo2.T) # <C|O>
 
-    h_tmp = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options)
-    h_tmp = jnp.einsum('pP,qQ,pq->PQ', C_cabs, C_cabs, h_tmp, optimize='optimal')
-    h = h.at[nobs:nri, nobs:nri].set(h_tmp) # <C|C>
+    mo3 = one_body_mo_computer(geom, cabs_set, cabs_set, C_cabs, C_cabs, xyz_path, deriv_order, options)
+    tv = tv.at[nobs:nri, nobs:nri].set(mo3) # <C|C>
 
-    return h
+    return tv
 
-def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    # OEINTS
-    f = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
+# Two-Electron Integrals
+def two_body_mo_computer(geom, int_type, bs1, bs2, bs3, bs4, C1, C2, C3, C4, xyz_path, deriv_order, options):
+    """
+    General two-body MO computer
+    that computes the AOs in chem notation,
+    returns them in phys notation,
+    and then transforms to MOs
+    """
+    AO = compute_f12_teints(geom, bs1, bs3, bs2, bs4, int_type, xyz_path, deriv_order, options)
+    AO = jnp.transpose(AO, (0,2,1,3))
+    MO = partial_tei_transformation(AO, C1, C2, C3, C4)
+    return MO
 
-    # TEINTS
-    G = jnp.empty((nri, nobs, nri, nri))
+def form_G(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
+    eri = jnp.empty((nri, nobs, nri, nri))
 
-    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_obs, C_obs[:, :ndocc], C_obs, C_obs)
-    G = G.at[:nobs, :ndocc, :nobs, :nobs].set(G_tmp) # <Oo|OO>
+    mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
+                              C_obs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[:nobs, :nobs, :nobs, :nobs].set(mo1) # <OO|OO>
 
-    G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs, C_obs, C_obs)
-    G = G.at[nobs:nri, :nobs, :nobs, :nobs].set(G_tmp) # <CO|OO>
-    G = G.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(G_tmp, (2,1,0,3))) # <OO|CO>
-    G = G.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(G_tmp, (3,2,1,0))) # <OO|OC>
+    mo2 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, basis_set,\
+                              C_cabs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :nobs, :nobs, :nobs].set(mo2) # <CO|OO>
+    eri = eri.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(mo2, (2,3,0,1))) # <OO|CO>
+    eri = eri.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(mo2, (3,2,1,0))) # <OO|OC>
 
-    G_tmp = compute_f12_teints(geom, cabs_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_obs, C_cabs)
-    G = G.at[nobs:nri, :ndocc, :nobs, nobs:nri].set(G_tmp) # <Co|OC>
+    mo3 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, cabs_set,\
+                              C_cabs, C_obs, C_obs, C_cabs, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :nobs, :nobs, nobs:nri].set(mo3) # <CO|OC>
 
-    G_tmp = compute_f12_teints(geom, cabs_set, cabs_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_cabs, C_obs[:, :ndocc], C_cabs, C_obs)
-    G = G.at[nobs:nri, :ndocc, nobs:nri, :nobs].set(G_tmp) # <Co|CO>
+    mo4 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, cabs_set, basis_set,\
+                              C_cabs, C_obs, C_cabs, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :nobs, nobs:nri, :nobs].set(mo4) # <CO|CO>
 
-    # Fill Fock Matrix
-    f += 2.0 * jnp.einsum('piqi->pq', G[:, :ndocc, :, :ndocc], optimize='optimal')
-    fk = f # Fock Matrix without Exchange
-    k =  jnp.einsum('piiq->pq', G[:, :ndocc, :ndocc, :], optimize='optimal')
-    f -= k
+    return eri
 
-    return f, fk, k
+def form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    f12 = jnp.empty((ndocc, ndocc, nri, nri))
+    C_occ = C_obs.at[:, :ndocc].get()
 
-# F12 Intermediates
-# F12 TEINTS are entered in Chem and returned in Phys
+    mo1 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
+    f12 = f12.at[:, :, :nobs, :nobs].set(mo1) # <oo|OO>
+
+    mo2 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, cabs_set,\
+                              C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
+    f12 = f12.at[:, :, :nobs, nobs:].set(mo2) # <oo|OC>
+    f12 = f12.at[:, :, nobs:, :nobs].set(jnp.transpose(mo2, (1,0,3,2))) # <oo|CO>
+
+    mo3 = two_body_mo_computer(geom, "f12", basis_set, basis_set, cabs_set, cabs_set,\
+                              C_occ, C_occ, C_cabs, C_cabs, xyz_path, deriv_order, options)
+    f12 = f12.at[:, :, nobs:, nobs:].set(mo3) # <oo|CC>
+
+    return f12
+
+def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    f12_squared = jnp.empty((ndocc, ndocc, ndocc, nri))
+    C_occ = C_obs.at[:, :ndocc].get()
+
+    mo1 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
+    f12_squared = f12_squared.at[:, :, :, :nobs].set(mo1) # <oo|oO>
+
+    mo2 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, cabs_set,\
+                              C_occ, C_occ, C_occ, C_cabs, xyz_path, deriv_order, options)
+    f12_squared = f12_squared.at[:, :, :, nobs:].set(mo2) # <oo|oC>
+
+    return f12_squared
 
-def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+def form_FG(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
 
-    V = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12g12", xyz_path, deriv_order, options)
-    V = partial_tei_transformation(V, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
+    f12g12 = two_body_mo_computer(geom, "f12g12", basis_set, basis_set, basis_set, basis_set,\
+                                  C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
+    return f12g12
 
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
-    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
-    V_tmp = jnp.einsum('ijmy,klmy->ijkl', G_tmp, F_tmp, optimize='optimal')
-    V -= V_tmp + f12_transpose(V_tmp)
+def form_Uf(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
 
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
-    G_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order, options)
-    G_tmp = partial_tei_transformation(G_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
-    V -= jnp.einsum('ijrs,klrs->ijkl', G_tmp, F_tmp, optimize='optimal')
+    f12_double_commutator = two_body_mo_computer(geom, "f12_double_commutator",\
+                                    basis_set, basis_set, basis_set, basis_set,\
+                                    C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
+    return f12_double_commutator
 
-    return V
+# Fock
+def form_Fock(h, Fock_G):
 
-def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+    G_1o1o, G_1oo1 = Fock_G
+    
+    # Fock Matrix without Exchange
+    fk = h + 2.0 * jnp.einsum('piqi->pq', G_1o1o, optimize='optimal')
 
-    X = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
-    X = partial_tei_transformation(X, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
+    # Exchange
+    k =  jnp.einsum('piiq->pq', G_1oo1, optimize='optimal')
 
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)
-    X_tmp = jnp.einsum('ijmy,klmy->ijkl', F_tmp, F_tmp, optimize='optimal')
-    X -= X_tmp + f12_transpose(X_tmp)
+    f = fk - k
 
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :nobs], C_obs[:, :nobs])
-    X -= jnp.einsum('ijrs,klrs->ijkl', F_tmp, F_tmp, optimize='optimal')
+    return f, fk, k
 
-    return X
+# F12 Intermediates
+def form_V(FG, VX_F, V_G):
+    
+    G_oooc, G_oopq = V_G
+    F_oooc, F_oopq = VX_F
 
-def form_C(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, ndocc, nobs, xyz_path, deriv_order, options):
+    ijkl_1 = jnp.tensordot(G_oooc, F_oooc, [(2, 3), (2, 3)])
+    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
+    ijkl_3 = jnp.tensordot(G_oopq, F_oopq, [(2, 3), (2, 3)])
+
+    return FG - ijkl_1 - ijkl_2 - ijkl_3
+
+def form_X(F2_oooo, VX_F):
+    
+    F_oooc, F_oopq = VX_F
 
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, ndocc:nobs], C_cabs)
-    tmp = jnp.einsum('klay,by->klab', F_tmp, Fock[ndocc:nobs, nobs:], optimize='optimal')
+    ijkl_1 = jnp.tensordot(F_oooc, F_oooc, [(2, 3), (2, 3)])
+    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
+    ijkl_3 = jnp.tensordot(F_oopq, F_oopq, [(2, 3), (2, 3)])
 
-    return tmp + f12_transpose(tmp)
+    return F2_oooo - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_B(geom, basis_set, cabs_set, C_obs, C_cabs, Fock, noK, K, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    # Term 1
-    B = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_double_commutator", xyz_path, deriv_order, options)
-    B = partial_tei_transformation(B, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc])
+def form_C(F_oovc, f_vc):
+
+    klab = jnp.tensordot(F_oovc, f_vc, [(3), (1)])
+
+    return klab + jnp.transpose(klab, (1,0,3,2))
+
+def form_B(Uf, F2, B_F, B_f, fk_o1, k):
+
+    F, F_ooo1, F_ooco, F_oovq, F_ooc1, F_oovc = B_F
+    f, f_oo, f_pq, f_1o, f_pc = B_f
 
     # Term 2
-    F2 = jnp.empty((ndocc, ndocc, ndocc, nri))
-    F2_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, :nobs].set(partial_tei_transformation(F2_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs)) # <oo|oO>
-    F2_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12_squared", xyz_path, deriv_order, options)
-    F2 = F2.at[:, :, :, nobs:].set(partial_tei_transformation(F2_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs)) # <oo|oC>
-
-    tmp = jnp.einsum('lknI,mI->lknm', F2, noK[:ndocc, :])
-    B += tmp + f12_transpose(tmp)
-
-    # F12 Integral
-    F_oo11 = jnp.empty((ndocc, ndocc, nri, nri))
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, basis_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, :nobs, :nobs].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_obs)) # <oo|OO>
-    F_tmp = compute_f12_teints(geom, basis_set, basis_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_tmp = partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_obs, C_cabs)
-    F_oo11 = F_oo11.at[:, :, :nobs, nobs:].set(F_tmp) # <oo|OC>
-    F_oo11 = F_oo11.at[:, :, nobs:, :nobs].set(f12_transpose(F_tmp)) # <oo|CO>
-    F_tmp = compute_f12_teints(geom, basis_set, cabs_set, basis_set, cabs_set, "f12", xyz_path, deriv_order, options)
-    F_oo11 = F_oo11.at[:, :, nobs:, nobs:].set(partial_tei_transformation(F_tmp, C_obs[:, :ndocc], C_obs[:, :ndocc], C_cabs, C_cabs)) # <oo|CC>
+    terms = jnp.tensordot(F2, fk_o1, [(3), (1)])
 
     # Term 3
-    tmp = jnp.einsum('lkPC,CA,nmPA->lknm', F_oo11, K, F_oo11, optimize='optimal')
-    B -= tmp + f12_transpose(tmp)
+    terms -= jnp.tensordot(jnp.tensordot(F, k, [(3), (0)]), F, [(2, 3), (2, 3)])
 
     # Term 4
-    tmp = jnp.einsum('lkjC,CA,nmjA->lknm', F_oo11[:, :, :ndocc, :], Fock, F_oo11[:, :, :ndocc, :], optimize='optimal')
-    B -= tmp + f12_transpose(tmp)
+    terms -= jnp.tensordot(jnp.tensordot(F_ooo1, f, [(3), (0)]), F_ooo1, [(2, 3), (2, 3)])
 
     # Term 5
-    tmp = jnp.einsum('lkxj,ji,nmxi->lknm', F_oo11[:, :, nobs:, :ndocc], Fock[:ndocc, :ndocc], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
-    B += tmp + f12_transpose(tmp)
+    terms += jnp.tensordot(jnp.tensordot(F_ooco, f_oo, [(3), (0)]), F_ooco, [(2, 3), (2, 3)])
 
     # Term 6
-    tmp = jnp.einsum('lkbp,pq,nmbq->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, :nobs], F_oo11[:, :, ndocc:nobs, :nobs], optimize='optimal')
-    B -= tmp + f12_transpose(tmp)
+    terms -= jnp.tensordot(jnp.tensordot(F_oovq, f_pq, [(3), (0)]), F_oovq, [(2, 3), (2, 3)])
 
     # Term 7
-    tmp = 2.0 * jnp.einsum('lkxI,Ij,nmxj->lknm', F_oo11[:, :, nobs:, :], Fock[:, :ndocc], F_oo11[:, :, nobs:, :ndocc], optimize='optimal')
-    B -= tmp + f12_transpose(tmp)
+    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F_ooc1, f_1o, [(3), (0)]), F_ooco, [(2, 3), (2, 3)])
 
     # Term 8
-    tmp = 2.0 * jnp.einsum('lkbq,qy,nmby->lknm', F_oo11[:, :, ndocc:nobs, :nobs], Fock[:nobs, nobs:], F_oo11[:, :, ndocc:nobs, nobs:], optimize='optimal')
-    B -= tmp + f12_transpose(tmp)
+    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F_oovq, f_pc, [(3), (0)]), F_oovc, [(2, 3), (2, 3)])
+
+
+    B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2))
 
-    return 0.5 * (B + jnp.transpose(B, (2,3,0,1)))
+    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1)))

From c687fda02c52d3f4e79b9d406e9bae36a82725f1 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Tue, 28 Nov 2023 13:45:33 -0500
Subject: [PATCH 29/91] Better AM sorting

---
 quax/integrals/libint_interface.cc | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 201ae26..cf987c7 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -62,13 +62,7 @@ libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
 
             stable_sort(tmp.begin(), tmp.end(), [](const auto& a, const auto& b) -> bool
             {
-                int a_l, b_l;
-                for (auto&& c_a : a.contr)
-                    a_l = c_a.l;
-                for (auto&& c_b : b.contr)
-                    b_l = c_b.l;
-
-                return a_l < b_l;
+                return a.contr[0].l < b.contr[0].l;
             });
 
             el_bases[atoms[i].atomic_number] = tmp;

From 5cd77ff4c46270f585e5d8f03f9199151ee2a760 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 29 Nov 2023 17:07:03 -0500
Subject: [PATCH 30/91] Threading fixed

---
 quax/core.py                  | 2 +-
 quax/integrals/basis_utils.py | 6 +++++-
 quax/methods/mp2f12.py        | 3 ---
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 2a47801..f6eb256 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -68,7 +68,7 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
 
     # Load molecule data
     geom2d = np.asarray(molecule.geometry())
-    geom_list = geom2d.reshape(-1).tolist() 
+    geom_list = geom2d.reshape(-1).tolist()
     geom = jnp.asarray(geom2d.flatten())
     dim = geom.reshape(-1).shape[0]
     xyz_file_name = "geom.xyz"
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index fafcbe9..f16f072 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -36,6 +36,9 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
     Builds and returns 
     CABS transformation matrix
     """
+    # Make Thread Safe
+    threads = psi4.get_num_threads()
+    psi4.set_num_threads(1)
 
     # Orthogonalize combined basis set
     S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
@@ -48,9 +51,10 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
     nN, Vt = null_svd(C12)
 
     V_N = jnp.transpose(Vt[nN:, :])
-
     C_cabs = jnp.dot(C_ribs, V_N)
 
+    psi4.set_num_threads(threads)
+
     return C_cabs
 
 @jax.custom_jvp
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index f220ca7..cb209f9 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -66,7 +66,6 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
             V_ij = V[i, j, o, o]
             GD_ij = G[i, j, v, v] * D_ij
             V_ij -= jnp.tensordot(C, GD_ij, [(2, 3), (0, 1)])
-            print(V_ij)
 
             V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
@@ -76,8 +75,6 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
             B_ij = B - (X * (f[i, i] + f[j, j]))
             CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
             B_ij -= jnp.tensordot(C, CD_ij, [(2, 3), (2, 3)])
-            print(B_ij)
-
             B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
                          * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
                          * (t_(i, j, i, j) + t_(i, j, j, i)) * kd

From 460955376effefa8e86eeddcf6b139da51606d1d Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 29 Nov 2023 17:36:50 -0500
Subject: [PATCH 31/91] Passes S12 @ C_cabs == 0

---
 quax/integrals/basis_utils.py | 42 ++++++++++-------------------------
 1 file changed, 12 insertions(+), 30 deletions(-)

diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index f16f072..2999f47 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -31,7 +31,7 @@ def build_RIBS(molecule, basis_set, cabs_name):
 
     return ribs_set
 
-def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
+def build_CABS(geom, basis_set, cabs_set, C_obs, xyz_path, deriv_order, options):
     """
     Builds and returns 
     CABS transformation matrix
@@ -46,39 +46,21 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
 
     # Compute the overlap matrix between OBS and RIBS, then orthogonalizes the RIBS
     S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
-    C12 = jnp.dot(S_ao_obs_ribs, C_ribs)
+    C12 = jnp.dot(C_obs.T, jnp.dot(S_ao_obs_ribs, C_ribs))
 
-    nN, Vt = null_svd(C12)
-
-    V_N = jnp.transpose(Vt[nN:, :])
-    C_cabs = jnp.dot(C_ribs, V_N)
-
-    psi4.set_num_threads(threads)
-
-    return C_cabs
-
-@jax.custom_jvp
-def null_svd(C12, cutoff = 1.0e-6):
-    """
-    Grabs the null vectors from the V matrix
-    of an SVD procedure and returns the 
-    number of null vecs and the null vec matrix
-    """
     # Compute the eigenvectors and eigenvalues of C12.T @ C12
-    _, S, Vt = jnp.linalg.svd(C12)
+    CTC = jnp.dot(C12.T, C12)
+    S2, V = jax.scipy.linalg.eigh(CTC)
 
-    # Collect the eigenvectors that are associated with (near) zero eignevalues
     def loop_zero_vals(idx, count):
-        count += jax.lax.cond(abs(S[idx]) < cutoff, lambda: 1, lambda: 0)
+        count += jax.lax.cond(abs(S2[idx]) < 1.0e-8, lambda: 1, lambda: 0)
         return count
-    nN = fori_loop(0, S.shape[0], loop_zero_vals, S.shape[0])
+    ncabs = jax.lax.fori_loop(0, S2.shape[0], loop_zero_vals, 0)
 
-    return nN, Vt
+    V_N = V.at[:, :ncabs].get()    
+
+    C_cabs = jnp.dot(C_ribs, V_N)
 
-@null_svd.defjvp
-def null_svd_jvp(primals, tangents):
-  C12, cutoff = primals
-  C12_dot, cutoff_dot = tangents
-  primal_out = null_svd(C12, cutoff)
-  tangent_out = null_svd(C12_dot, cutoff)
-  return primal_out, tangent_out
+    psi4.set_num_threads(threads)
+
+    return C_cabs

From 8139634a641c40d0461455616d627a1c654ba9ff Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 12 Dec 2023 16:38:11 -0500
Subject: [PATCH 32/91] CABS C.T @ S22 @ C == 1, and deriv == 0

---
 quax/integrals/basis_utils.py | 16 ++++++++--------
 quax/methods/energy_utils.py  |  6 +++---
 quax/methods/hartree_fock.py  |  8 ++++----
 quax/methods/ints.py          |  2 +-
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 2999f47..9c7f7b8 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -31,7 +31,7 @@ def build_RIBS(molecule, basis_set, cabs_name):
 
     return ribs_set
 
-def build_CABS(geom, basis_set, cabs_set, C_obs, xyz_path, deriv_order, options):
+def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
     """
     Builds and returns 
     CABS transformation matrix
@@ -46,21 +46,21 @@ def build_CABS(geom, basis_set, cabs_set, C_obs, xyz_path, deriv_order, options)
 
     # Compute the overlap matrix between OBS and RIBS, then orthogonalizes the RIBS
     S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
-    C12 = jnp.dot(C_obs.T, jnp.dot(S_ao_obs_ribs, C_ribs))
 
-    # Compute the eigenvectors and eigenvalues of C12.T @ C12
-    CTC = jnp.dot(C12.T, C12)
-    S2, V = jax.scipy.linalg.eigh(CTC)
+    # Compute the eigenvectors and eigenvalues of C2.T @ S12.T @ S12 @ C2
+    S22 = jnp.dot(S_ao_obs_ribs.T, S_ao_obs_ribs)
+    CTC = C_ribs.T @ S22 @ C_ribs
+    S2, V = jnp.linalg.eigh(CTC)
 
     def loop_zero_vals(idx, count):
-        count += jax.lax.cond(abs(S2[idx]) < 1.0e-8, lambda: 1, lambda: 0)
+        count += jax.lax.cond(abs(S2[idx]) < 1.0e-6, lambda: 1, lambda: 0)
         return count
     ncabs = jax.lax.fori_loop(0, S2.shape[0], loop_zero_vals, 0)
 
-    V_N = V.at[:, :ncabs].get()    
+    V_N = V.at[:, :ncabs].get()
 
     C_cabs = jnp.dot(C_ribs, V_N)
 
     psi4.set_num_threads(threads)
 
-    return C_cabs
+    return C_cabs
\ No newline at end of file
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 39a1119..8df2234 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -22,8 +22,8 @@ def symmetric_orthogonalization(S, cutoff = 1.0e-12):
     evals, evecs = jnp.linalg.eigh(S)
 
     def loop_evals(idx, M):
-        val = jax.lax.cond(abs(evals[idx]) > cutoff,
-                           lambda: jnp.reciprocal(jnp.sqrt(evals[idx])),
+        val = jax.lax.cond(abs(evals[idx]) > cutoff * jnp.max(abs(evals)),
+                           lambda: 1 / jnp.sqrt(evals[idx]),
                            lambda: 0.0)
         
         M = M.at[idx, idx].set(val)
@@ -31,7 +31,7 @@ def loop_evals(idx, M):
     
     sqrtm = jax.lax.fori_loop(0, evals.shape[0], loop_evals, jnp.zeros(S.shape))
 
-    A = jnp.dot(evecs, jnp.dot(sqrtm, jnp.transpose(evecs)))
+    A = evecs @ sqrtm @ evecs.T
     return A
 
 def cholesky_orthogonalization(S):
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 623d071..ebaa392 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -47,12 +47,12 @@ def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge,
     
     def rhf_iter(F,D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
-        Fp = jnp.dot(A.T, jnp.dot(F, A))
+        Fp = A.T @ F @ A
         Fp = Fp + shift 
         eps, C2 = jnp.linalg.eigh(Fp)
-        C = jnp.dot(A, C2)
+        C = A @ C2
         Cocc = C[:, :ndocc]
-        D = jnp.dot(Cocc, Cocc.T)
+        D = Cocc @ Cocc.T
         return E_scf, D, C, eps
 
     iteration = 0
@@ -77,7 +77,7 @@ def rhf_iter(F,D):
         # Update convergence error
         if iteration > 1:
             diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
-            diis_e = A.dot(diis_e).dot(A)
+            diis_e = A @ diis_e @ A
             dRMS = jnp.mean(diis_e ** 2) ** 0.5
         # Compute energy, transform Fock and diagonalize, get new density
         E_scf, D, C, eps = rhf_iter(F, D)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 3f2f4b9..c48b73a 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -189,7 +189,7 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         # Check if there are `deriv_order` datasets in the eri file
-        correct_deriv_order = len(oeifile) >= 3 * (deriv_order)
+        correct_deriv_order = len(oeifile) >= (3 * deriv_order)
         # Check nbf dimension of integral arrays
         sample_dataset_name = list(oeifile.keys())[0]
         correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1

From 53b94235e5334e750de8181742489b59aca379db Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 12 Dec 2023 17:23:26 -0500
Subject: [PATCH 33/91] More memory-efficient F12 algorithm

---
 quax/methods/mp2f12.py | 205 +++++++++++++++++++++++------------------
 1 file changed, 117 insertions(+), 88 deletions(-)

diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index cb209f9..5639fbf 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -14,42 +14,34 @@
 def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=0):
     nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
-    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order, return_aux_data=True)
+    eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
     C_cabs = build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
 
     nobs = C_obs.shape[0]
-    nri = C_cabs.shape[0]
-	
-    o, v, p, c, A = slice(0, ndocc), slice(ndocc, nobs), slice(0, nobs), slice(nobs, nri), slice(0, nri)
-
-    eps_occ, eps_vir = eps[o], eps[v]
+    nri = C_obs.shape[0] + C_cabs.shape[1]
 
     # Fock
-    h = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
-    G = form_G(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
-    f, fk, k = form_Fock(h, (G[A, o, A, o], G[A, o, o, A]))
+    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # V Intermediate
-    FG = form_FG(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options)
-    F = form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    V = form_V(FG, (F[o, o, o, c], F[o, o, p, p]), (G[o, o, o, c], G[o, o, p, p]))
+    V = form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # X Intermediate
-    F2 = form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    X = form_X(F2[o, o, o, o], (F[o, o, o, c], F[o, o, p, p]))
+    X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # C Intermediate
-    C = form_C(F[o, o, v, c], f[v, c])
+    C = form_C(geom, basis_set, cabs_set, f[ndocc:nobs, nobs:], C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
 
     # B Intermediate
-    Uf = form_Uf(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options)
-    B = form_B(Uf, F2, (F, F[o, o, o, A], F[o, o, c, o], F[o, o, v, p], F[o, o, c, A], F[o, o, v, c]),\
-               (f, f[o, o], f[p, p], f[A, o], f[p, c]), fk[o, A], k)
+    B = form_B(geom, basis_set, cabs_set, f, k, fk[:ndocc, :], C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+
+    D = -1.0 / (eps_occ.reshape(-1, 1, 1, 1) + eps_occ.reshape(-1, 1, 1) - eps_vir.reshape(-1, 1) - eps_vir)
 
-    D = -1.0 * jnp.reciprocal(eps_occ.reshape(-1, 1, 1, 1) + eps_occ.reshape(-1, 1, 1) - eps_vir.reshape(-1, 1) - eps_vir)
-    Dv = slice(0, nobs - ndocc)
+    G = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
+                             C_obs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
     
     # indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
@@ -61,20 +53,19 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
         for j in range(i, ndocc):
             kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
 
-            D_ij = D[i, j, Dv, Dv]
+            D_ij = D[i, j, :, :]
 
-            V_ij = V[i, j, o, o]
-            GD_ij = G[i, j, v, v] * D_ij
-            V_ij -= jnp.tensordot(C, GD_ij, [(2, 3), (0, 1)])
+            GD_ij = G[i, j, ndocc:, ndocc:] * D_ij
+            V_ij = V[i, j, :, :] - jnp.tensordot(C, GD_ij, [(2, 3), (0, 1)])
 
             V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
             V_t = 0.25 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
                                                    * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
-            B_ij = B - (X * (f[i, i] + f[j, j]))
             CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
-            B_ij -= jnp.tensordot(C, CD_ij, [(2, 3), (2, 3)])
+            B_ij = B - (X * (f[i, i] + f[j, j])) - jnp.tensordot(C, CD_ij, [(2, 3), (2, 3)])
+
             B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
                          * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
                          * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
@@ -84,8 +75,13 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
                                                      * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
                                                      lambda: 0.0)
 
-            dE_mp2f12 += kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
-            dE_mp2f12 += 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
+            E_s = kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
+            E_t = 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
+
+            # print(E_s)
+            # print(E_t)
+
+            dE_mp2f12 += E_s + E_t
 
     #     return f12_corr
 
@@ -113,11 +109,11 @@ def one_body_mo_computer(geom, bs1, bs2, C1, C2, xyz_path, deriv_order, options)
     """
     T, V = compute_f12_oeints(geom, bs1, bs2, xyz_path, deriv_order, options, False)
     AO = T + V
-    MO = jnp.dot(C1.T, jnp.dot(AO, C2))
+    MO = C1.T @ AO @ C2
     return MO
 
 def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
-    tv = jnp.empty((nri, nri))
+    tv = jnp.zeros((nri, nri))
 
     mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, xyz_path, deriv_order, options)
     tv = tv.at[:nobs, :nobs].set(mo1) # <O|O>
@@ -140,35 +136,64 @@ def two_body_mo_computer(geom, int_type, bs1, bs2, bs3, bs4, C1, C2, C3, C4, xyz
     and then transforms to MOs
     """
     AO = compute_f12_teints(geom, bs1, bs3, bs2, bs4, int_type, xyz_path, deriv_order, options)
-    AO = jnp.transpose(AO, (0,2,1,3))
-    MO = partial_tei_transformation(AO, C1, C2, C3, C4)
+    MO = partial_tei_transformation(AO, C1, C3, C2, C4)
+    MO = jnp.swapaxes(MO, 1, 2)
     return MO
 
-def form_G(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
-    eri = jnp.empty((nri, nobs, nri, nri))
+def form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    eri = jnp.zeros((nri, ndocc, nri, ndocc))
+    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
-                              C_obs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
-    eri = eri.at[:nobs, :nobs, :nobs, :nobs].set(mo1) # <OO|OO>
+                               C_obs, C_occ, C_obs, C_occ, xyz_path, deriv_order, options)
+    eri = eri.at[:nobs, :, :nobs, :].set(mo1) # <Oo|Oo>
 
     mo2 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, basis_set,\
-                              C_cabs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
-    eri = eri.at[nobs:nri, :nobs, :nobs, :nobs].set(mo2) # <CO|OO>
-    eri = eri.at[:nobs, :nobs, nobs:nri, :nobs].set(jnp.transpose(mo2, (2,3,0,1))) # <OO|CO>
-    eri = eri.at[:nobs, :nobs, :nobs, nobs:nri].set(jnp.transpose(mo2, (3,2,1,0))) # <OO|OC>
+                              C_cabs, C_occ, C_obs, C_occ, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :, :nobs, :].set(mo2) # <Co|Oo>
+    eri = eri.at[:nobs, :, nobs:nri, :].set(jnp.transpose(mo2, (2,3,0,1))) # <Oo|Co>
+
+    mo3 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, cabs_set, basis_set,\
+                              C_cabs, C_occ, C_cabs, C_occ, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :, nobs:nri, :].set(mo3) # <Co|Co>
+
+    return eri
+
+def form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    eri = jnp.empty((nri, ndocc, ndocc, nri))
+    C_occ = C_obs.at[:, :ndocc].get()
+
+    mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
+                              C_obs, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[:nobs, :, :, :nobs].set(mo1) # <Oo|oO>
+
+    mo2 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, basis_set,\
+                              C_cabs, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :, :, :nobs].set(mo2) # <Co|oO>
+    eri = eri.at[:nobs, :, :, nobs:nri].set(jnp.transpose(mo2, (3,2,1,0))) # <Oo|oC>
 
     mo3 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, cabs_set,\
-                              C_cabs, C_obs, C_obs, C_cabs, xyz_path, deriv_order, options)
-    eri = eri.at[nobs:nri, :nobs, :nobs, nobs:nri].set(mo3) # <CO|OC>
+                              C_cabs, C_occ, C_occ, C_cabs, xyz_path, deriv_order, options)
+    eri = eri.at[nobs:nri, :, :, nobs:nri].set(mo3) # <Co|oC>
+
+    return eri
+
+def form_ooO1(geom, int_type, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    eri = jnp.zeros((ndocc, ndocc, nobs, nri))
+    C_occ = C_obs.at[:, :ndocc].get()
+
+    mo1 = two_body_mo_computer(geom, int_type, basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
+    eri = eri.at[:, :, :, :nobs].set(mo1) # <oo|OO>
 
-    mo4 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, cabs_set, basis_set,\
-                              C_cabs, C_obs, C_cabs, C_obs, xyz_path, deriv_order, options)
-    eri = eri.at[nobs:nri, :nobs, nobs:nri, :nobs].set(mo4) # <CO|CO>
+    mo2 = two_body_mo_computer(geom, int_type, basis_set, basis_set, basis_set, cabs_set,\
+                               C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
+    eri = eri.at[:, :, :, nobs:].set(mo2) # <oo|OC>
 
     return eri
 
 def form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    f12 = jnp.empty((ndocc, ndocc, nri, nri))
+    f12 = jnp.zeros((ndocc, ndocc, nri, nri))
     C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, basis_set,\
@@ -187,7 +212,7 @@ def form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
     return f12
 
 def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    f12_squared = jnp.empty((ndocc, ndocc, ndocc, nri))
+    f12_squared = jnp.zeros((ndocc, ndocc, ndocc, nri))
     C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
@@ -200,68 +225,68 @@ def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path
 
     return f12_squared
 
-def form_FG(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
-
-    f12g12 = two_body_mo_computer(geom, "f12g12", basis_set, basis_set, basis_set, basis_set,\
-                                  C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
-    return f12g12
-
-def form_Uf(geom, basis_set, C_obs, ndocc, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
-
-    f12_double_commutator = two_body_mo_computer(geom, "f12_double_commutator",\
-                                    basis_set, basis_set, basis_set, basis_set,\
-                                    C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
-    return f12_double_commutator
-
 # Fock
-def form_Fock(h, Fock_G):
+def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
 
-    G_1o1o, G_1oo1 = Fock_G
+    h = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
+    J = form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    K = form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
     
     # Fock Matrix without Exchange
-    fk = h + 2.0 * jnp.einsum('piqi->pq', G_1o1o, optimize='optimal')
+    fk = h + (2.0 * jnp.einsum('piqi->pq', J, optimize='optimal'))
 
     # Exchange
-    k =  jnp.einsum('piiq->pq', G_1oo1, optimize='optimal')
+    k =  jnp.einsum('piiq->pq', K, optimize='optimal')
 
     f = fk - k
 
     return f, fk, k
 
 # F12 Intermediates
-def form_V(FG, VX_F, V_G):
+def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
     
-    G_oooc, G_oopq = V_G
-    F_oooc, F_oopq = VX_F
+    FG = two_body_mo_computer(geom, "f12g12", basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
+    G = form_ooO1(geom, "eri", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    ijkl_1 = jnp.tensordot(G_oooc, F_oooc, [(2, 3), (2, 3)])
+    ijkl_1 = jnp.tensordot(G[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], [(2, 3), (2, 3)])
     ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
-    ijkl_3 = jnp.tensordot(G_oopq, F_oopq, [(2, 3), (2, 3)])
+    ijkl_3 = jnp.tensordot(G[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], [(2, 3), (2, 3)])
 
     return FG - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_X(F2_oooo, VX_F):
+def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
     
-    F_oooc, F_oopq = VX_F
+    F2 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
+    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    ijkl_1 = jnp.tensordot(F_oooc, F_oooc, [(2, 3), (2, 3)])
+    ijkl_1 = jnp.tensordot(F[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], [(2, 3), (2, 3)])
     ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
-    ijkl_3 = jnp.tensordot(F_oopq, F_oopq, [(2, 3), (2, 3)])
+    ijkl_3 = jnp.tensordot(F[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], [(2, 3), (2, 3)])
 
-    return F2_oooo - ijkl_1 - ijkl_2 - ijkl_3
+    return F2 - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_C(F_oovc, f_vc):
+def form_C(geom, basis_set, cabs_set, f_vc, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
 
-    klab = jnp.tensordot(F_oovc, f_vc, [(3), (1)])
+    F = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, cabs_set,\
+                              C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
 
-    return klab + jnp.transpose(klab, (1,0,3,2))
+    klab = jnp.tensordot(F[:, :, ndocc:nobs, :], f_vc, [(3), (1)])
 
-def form_B(Uf, F2, B_F, B_f, fk_o1, k):
+    return klab + jnp.transpose(klab, (1,0,3,2))
 
-    F, F_ooo1, F_ooco, F_oovq, F_ooc1, F_oovc = B_F
-    f, f_oo, f_pq, f_1o, f_pc = B_f
+def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+    C_occ = C_obs.at[:, :ndocc].get()
+    
+    Uf = two_body_mo_computer(geom, "f12_double_commutator", basis_set, basis_set, basis_set, basis_set,\
+                              C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
+    F2 = form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    F = form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # Term 2
     terms = jnp.tensordot(F2, fk_o1, [(3), (1)])
@@ -270,20 +295,24 @@ def form_B(Uf, F2, B_F, B_f, fk_o1, k):
     terms -= jnp.tensordot(jnp.tensordot(F, k, [(3), (0)]), F, [(2, 3), (2, 3)])
 
     # Term 4
-    terms -= jnp.tensordot(jnp.tensordot(F_ooo1, f, [(3), (0)]), F_ooo1, [(2, 3), (2, 3)])
+    terms -= jnp.tensordot(jnp.tensordot(F[:, :, :ndocc, :], f, [(3), (0)]), \
+                           F[:, :, :ndocc, :], [(2, 3), (2, 3)])
 
     # Term 5
-    terms += jnp.tensordot(jnp.tensordot(F_ooco, f_oo, [(3), (0)]), F_ooco, [(2, 3), (2, 3)])
+    terms += jnp.tensordot(jnp.tensordot(F[:, :, nobs:, :ndocc], f[:ndocc, :ndocc], [(3), (0)]), \
+                           F[:, :, nobs:, :ndocc], [(2, 3), (2, 3)])
 
     # Term 6
-    terms -= jnp.tensordot(jnp.tensordot(F_oovq, f_pq, [(3), (0)]), F_oovq, [(2, 3), (2, 3)])
+    terms -= jnp.tensordot(jnp.tensordot(F[:, :, ndocc:nobs, :nobs], f[:nobs, :nobs], [(3), (0)]), \
+                           F[:, :, ndocc:nobs, :nobs], [(2, 3), (2, 3)])
 
     # Term 7
-    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F_ooc1, f_1o, [(3), (0)]), F_ooco, [(2, 3), (2, 3)])
+    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F[:, :, nobs:, :], f[:, :ndocc], [(3), (0)]), \
+                                 F[:, :, nobs:, :ndocc], [(2, 3), (2, 3)])
 
     # Term 8
-    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F_oovq, f_pc, [(3), (0)]), F_oovc, [(2, 3), (2, 3)])
-
+    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F[:, :, ndocc:nobs, :nobs], f[:nobs, nobs:], [(3), (0)]), \
+                                 F[:, :, ndocc:nobs, nobs:], [(2, 3), (2, 3)])
 
     B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2))
 

From 1c8d4e5dc58fb1c2d8645903a0ffa266037b8478 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 15 Dec 2023 18:36:09 -0500
Subject: [PATCH 34/91] Back to einsums, and testing CABS (WIP)

---
 quax/integrals/basis_utils.py | 11 ++---
 quax/methods/mp2f12.py        | 81 ++++++++++++++++++++++-------------
 2 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
index 9c7f7b8..6f53a17 100644
--- a/quax/integrals/basis_utils.py
+++ b/quax/integrals/basis_utils.py
@@ -52,15 +52,16 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
     CTC = C_ribs.T @ S22 @ C_ribs
     S2, V = jnp.linalg.eigh(CTC)
 
-    def loop_zero_vals(idx, count):
-        count += jax.lax.cond(abs(S2[idx]) < 1.0e-6, lambda: 1, lambda: 0)
-        return count
-    ncabs = jax.lax.fori_loop(0, S2.shape[0], loop_zero_vals, 0)
+    ### PROBLEM CHILD ###
 
-    V_N = V.at[:, :ncabs].get()
+    ncabs = jnp.where(S2 < 1.0e-6, True, False)
+
+    V_N = V[:, ncabs]
 
     C_cabs = jnp.dot(C_ribs, V_N)
 
+    ### PROBLEM CHILD ###
+
     psi4.set_num_threads(threads)
 
     return C_cabs
\ No newline at end of file
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 5639fbf..1dfb01a 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -20,6 +20,12 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     print("Running MP2-F12 Computation...")
     C_cabs = build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
 
+    # S_ao = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
+    # test = C_cabs.T @ S_ao @ C_cabs
+    # print(test)
+
+    # return jnp.array([0, 0])
+
     nobs = C_obs.shape[0]
     nri = C_obs.shape[0] + C_cabs.shape[1]
 
@@ -33,7 +39,7 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # C Intermediate
-    C = form_C(geom, basis_set, cabs_set, f[ndocc:nobs, nobs:], C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+    C = form_C(geom, basis_set, cabs_set, f[nobs:, ndocc:nobs], C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
 
     # B Intermediate
     B = form_B(geom, basis_set, cabs_set, f, k, fk[:ndocc, :], C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
@@ -55,8 +61,8 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
 
             D_ij = D[i, j, :, :]
 
-            GD_ij = G[i, j, ndocc:, ndocc:] * D_ij
-            V_ij = V[i, j, :, :] - jnp.tensordot(C, GD_ij, [(2, 3), (0, 1)])
+            GD_ij = jnp.einsum('ab,ab->ab', G[i, j, ndocc:, ndocc:], D_ij, optimize='optimal')
+            V_ij = V[i, j, :, :] - jnp.einsum('klab,ab->kl', C, GD_ij, optimize='optimal')
 
             V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
@@ -64,7 +70,7 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
                                                    * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
             CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
-            B_ij = B - (X * (f[i, i] + f[j, j])) - jnp.tensordot(C, CD_ij, [(2, 3), (2, 3)])
+            B_ij = B - (X * (f[i, i] + f[j, j])) - jnp.einsum('klab,mnab->klmn', C, CD_ij, optimize='optimal')
 
             B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
                          * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
@@ -132,8 +138,8 @@ def two_body_mo_computer(geom, int_type, bs1, bs2, bs3, bs4, C1, C2, C3, C4, xyz
     """
     General two-body MO computer
     that computes the AOs in chem notation,
-    returns them in phys notation,
-    and then transforms to MOs
+    then transforms to MOs,
+    and returns the MOs in phys notation
     """
     AO = compute_f12_teints(geom, bs1, bs3, bs2, bs4, int_type, xyz_path, deriv_order, options)
     MO = partial_tei_transformation(AO, C1, C3, C2, C4)
@@ -228,12 +234,12 @@ def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path
 # Fock
 def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
 
-    h = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
+    fk = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
     J = form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
     K = form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
     
     # Fock Matrix without Exchange
-    fk = h + (2.0 * jnp.einsum('piqi->pq', J, optimize='optimal'))
+    fk += 2.0 * jnp.einsum('piqi->pq', J, optimize='optimal')
 
     # Exchange
     k =  jnp.einsum('piiq->pq', K, optimize='optimal')
@@ -242,6 +248,22 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
 
     return f, fk, k
 
+# CABS Singles
+def cabs_singles(f, ndocc, nri):
+    all_vir = nri - ndocc
+
+    e_ij, C_ij = jnp.linalg.eigh(f[:ndocc, :ndocc])
+    e_AB, C_AB = jnp.linalg.eigh(f[ndocc:, ndocc:])
+
+    f_iA = C_ij @ f[:ndocc, ndocc:] @ C_AB.T
+
+    E_s = 0.0
+    for A in range(all_vir):
+        for i in range(ndocc):
+            E_s += (2 * f_iA[i, A] ** 2) / (e_ij[i] - e_AB[A])
+
+    return E_s
+
 # F12 Intermediates
 def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
     C_occ = C_obs.at[:, :ndocc].get()
@@ -251,9 +273,9 @@ def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
     G = form_ooO1(geom, "eri", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
     F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    ijkl_1 = jnp.tensordot(G[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], [(2, 3), (2, 3)])
-    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
-    ijkl_3 = jnp.tensordot(G[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], [(2, 3), (2, 3)])
+    ijkl_1 = jnp.einsum('ijmy,klmy->ijkl', G[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], optimize='optimal')
+    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2)) # ijxn,klxn->ijkl
+    ijkl_3 = jnp.einsum('ijrs,klrs->ijkl', G[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], optimize='optimal')
 
     return FG - ijkl_1 - ijkl_2 - ijkl_3
 
@@ -264,19 +286,19 @@ def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
                               C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
     F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
-    ijkl_1 = jnp.tensordot(F[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], [(2, 3), (2, 3)])
-    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2))
-    ijkl_3 = jnp.tensordot(F[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], [(2, 3), (2, 3)])
+    ijkl_1 = jnp.einsum('ijmy,klmy->ijkl', F[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], optimize='optimal')
+    ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2)) # ijxn,klxn->ijkl
+    ijkl_3 = jnp.einsum('ijrs,klrs->ijkl', F[:, :, :nobs, :nobs], F[:, :, :nobs, :nobs], optimize='optimal')
 
     return F2 - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_C(geom, basis_set, cabs_set, f_vc, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
+def form_C(geom, basis_set, cabs_set, f_cv, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
     C_occ = C_obs.at[:, :ndocc].get()
 
     F = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, cabs_set,\
                               C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
 
-    klab = jnp.tensordot(F[:, :, ndocc:nobs, :], f_vc, [(3), (1)])
+    klab = jnp.einsum('klax,xb->klab', F[:, :, ndocc:nobs, :], f_cv, optimize='optimal')
 
     return klab + jnp.transpose(klab, (1,0,3,2))
 
@@ -289,31 +311,30 @@ def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_obs, C_cabs, ndocc, nobs, n
     F = form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
 
     # Term 2
-    terms = jnp.tensordot(F2, fk_o1, [(3), (1)])
+    terms = jnp.einsum('nmlP,kP->nmlk', F2, fk_o1)
 
     # Term 3
-    terms -= jnp.tensordot(jnp.tensordot(F, k, [(3), (0)]), F, [(2, 3), (2, 3)])
+    terms -= jnp.einsum('nmQP,PR,lkQR->nmlk', F, k, F, optimize='optimal')
 
     # Term 4
-    terms -= jnp.tensordot(jnp.tensordot(F[:, :, :ndocc, :], f, [(3), (0)]), \
-                           F[:, :, :ndocc, :], [(2, 3), (2, 3)])
+    terms -= jnp.einsum('nmjP,PR,lkjR->nmlk', F[:, :, :ndocc, :], f, F[:, :, :ndocc, :], optimize='optimal')
 
     # Term 5
-    terms += jnp.tensordot(jnp.tensordot(F[:, :, nobs:, :ndocc], f[:ndocc, :ndocc], [(3), (0)]), \
-                           F[:, :, nobs:, :ndocc], [(2, 3), (2, 3)])
+    terms += jnp.einsum('nmyi,ij,lkyj->nmlk', F[:, :, nobs:, :ndocc], f[:ndocc, :ndocc],\
+                                              F[:, :, nobs:, :ndocc], optimize='optimal')
 
     # Term 6
-    terms -= jnp.tensordot(jnp.tensordot(F[:, :, ndocc:nobs, :nobs], f[:nobs, :nobs], [(3), (0)]), \
-                           F[:, :, ndocc:nobs, :nobs], [(2, 3), (2, 3)])
+    terms -= jnp.einsum('nmbp,pr,lkbr->nmlk', F[:, :, ndocc:nobs, :nobs], f[:nobs, :nobs],\
+                                              F[:, :, ndocc:nobs, :nobs], optimize='optimal')
 
     # Term 7
-    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F[:, :, nobs:, :], f[:, :ndocc], [(3), (0)]), \
-                                 F[:, :, nobs:, :ndocc], [(2, 3), (2, 3)])
+    terms -= 2.0 * jnp.einsum('nmyi,iP,lkyP->nmlk', F[:, :, nobs:, :], f[:, :ndocc],\
+                                                    F[:, :, nobs:, :ndocc], optimize='optimal')
 
     # Term 8
-    terms -= 2.0 * jnp.tensordot(jnp.tensordot(F[:, :, ndocc:nobs, :nobs], f[:nobs, nobs:], [(3), (0)]), \
-                                 F[:, :, ndocc:nobs, nobs:], [(2, 3), (2, 3)])
+    terms -= 2.0 * jnp.einsum('nmbx,xq,lkbq->nmlk', F[:, :, ndocc:nobs, :nobs], f[:nobs, nobs:],\
+                                                    F[:, :, ndocc:nobs, nobs:], optimize='optimal')
 
-    B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2))
+    B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2)) # nmlk->mnkl
 
-    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1)))
+    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1))) # mnkl + klmn

From 21b077ad5f8e409d92178b8286bd87dc34b02ebd Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 3 Jan 2024 15:56:06 -0500
Subject: [PATCH 35/91] Working MP2-F12, custom JVP for SVD, working CABS
 singles

---
 quax/core.py                       |   2 +-
 quax/integrals/basis_utils.py      |  67 ----------------
 quax/integrals/libint_interface.cc |   8 +-
 quax/integrals/oei.py              |  34 +++++---
 quax/integrals/tei.py              |  69 ++++++-----------
 quax/methods/basis_utils.py        | 120 +++++++++++++++++++++++++++++
 quax/methods/energy_utils.py       |  10 +--
 quax/methods/ints.py               |   6 +-
 quax/methods/mp2f12.py             |  76 +++++++++---------
 9 files changed, 209 insertions(+), 183 deletions(-)
 delete mode 100644 quax/integrals/basis_utils.py
 create mode 100644 quax/methods/basis_utils.py

diff --git a/quax/core.py b/quax/core.py
index f6eb256..0c861c5 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -8,7 +8,7 @@
 import os
 import h5py
 
-from .integrals.basis_utils import build_RIBS
+from .methods.basis_utils import build_RIBS
 from .methods.hartree_fock import restricted_hartree_fock
 from .methods.mp2 import restricted_mp2
 from .methods.mp2f12 import restricted_mp2_f12
diff --git a/quax/integrals/basis_utils.py b/quax/integrals/basis_utils.py
deleted file mode 100644
index 6f53a17..0000000
--- a/quax/integrals/basis_utils.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import psi4
-import jax
-import jax.numpy as jnp
-from jax.lax import fori_loop
-
-from ..methods.ints import compute_f12_oeints
-from ..methods.energy_utils import symmetric_orthogonalization
-
-def build_RIBS(molecule, basis_set, cabs_name):
-    """
-    Builds basis set for
-    CABS procedure
-    """
-
-    # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
-    basis_name = basis_set.name()
-    try:
-        psi4_name = cabs_name.lower().replace('cabs', 'optri')
-    except:
-        raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
-
-    keys = ["BASIS","CABS_BASIS"]
-    targets = [basis_name, psi4_name]
-    roles = ["ORBITAL","F12"]
-    others = [basis_name, basis_name]
-
-    # Creates combined basis set in Python
-    ao_union = psi4.driver.qcdb.libmintsbasisset.BasisSet.pyconstruct_combined(molecule.save_string_xyz(), keys, targets, roles, others)
-    ao_union['name'] = cabs_name
-    ribs_set = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
-
-    return ribs_set
-
-def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
-    """
-    Builds and returns 
-    CABS transformation matrix
-    """
-    # Make Thread Safe
-    threads = psi4.get_num_threads()
-    psi4.set_num_threads(1)
-
-    # Orthogonalize combined basis set
-    S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
-    C_ribs = symmetric_orthogonalization(S_ao_ribs_ribs, 1.0e-8)
-
-    # Compute the overlap matrix between OBS and RIBS, then orthogonalizes the RIBS
-    S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
-
-    # Compute the eigenvectors and eigenvalues of C2.T @ S12.T @ S12 @ C2
-    S22 = jnp.dot(S_ao_obs_ribs.T, S_ao_obs_ribs)
-    CTC = C_ribs.T @ S22 @ C_ribs
-    S2, V = jnp.linalg.eigh(CTC)
-
-    ### PROBLEM CHILD ###
-
-    ncabs = jnp.where(S2 < 1.0e-6, True, False)
-
-    V_N = V[:, ncabs]
-
-    C_cabs = jnp.dot(C_ribs, V_N)
-
-    ### PROBLEM CHILD ###
-
-    psi4.set_num_threads(threads)
-
-    return C_cabs
\ No newline at end of file
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index cf987c7..f974a72 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -3447,10 +3447,10 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("f12_double_commutator_deriv_disk", &f12_double_commutator_deriv_disk, "Computes gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_core", &oei_deriv_core, "Computes a single OEI integral derivative tensor, in memory.");
     m.def("eri_deriv_core", &eri_deriv_core, "Computes a single coulomb integral nuclear derivative tensor, in memory.");
-    m.def("f12_deriv_core", &f12_deriv_core, "Computes a single contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
-    m.def("f12_squared_deriv_core", &f12_squared_deriv_core, "Computes a single sqaured contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
-    m.def("f12g12_deriv_core", &f12g12_deriv_core, "Computes a single contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensor, in memory.");
-    m.def("f12_double_commutator_deriv_core", &f12_double_commutator_deriv_core, "Computes a single gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    //m.def("f12_partial_deriv_core", &f12_deriv_core, "Computes a single contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    //m.def("f12_squared_partial_deriv_core", &f12_squared_deriv_core, "Computes a single sqaured contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    //m.def("f12g12_partial_deriv_core", &f12g12_deriv_core, "Computes a single contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensor, in memory.");
+    //m.def("f12_double_commutator_partial_deriv_core", &f12_double_commutator_deriv_core, "Computes a single gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
     //TODO partial derivative impl's
     //m.def("eri_partial_deriv_disk", &eri_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
      m.attr("LIBINT2_MAX_DERIV_ORDER") = LIBINT2_MAX_DERIV_ORDER;
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 64de62b..1581abd 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -20,7 +20,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
 
-        if 'core' in mode and max_deriv_order > 0:
+        if mode == 'core' and max_deriv_order > 0:
             # A list of OEI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
             # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf)
@@ -109,10 +109,13 @@ def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if 'core' in self.mode:
+        if self.mode == 'core':
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(S)
-        elif 'disk' in self.mode:
+        if self.mode == 'f12':
+            S = libint_interface.overlap_deriv(deriv_vec)
+            return jnp.asarray(S).reshape(self.nbf1,self.nbf2)
+        elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "overlap_deriv" + str(deriv_order)
@@ -136,10 +139,13 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if 'core' in self.mode:
+        if self.mode == 'core':
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(T)
-        elif 'disk' in self.mode:
+        if self.mode == 'f12':
+            T = libint_interface.kinetic_deriv(deriv_vec)
+            return jnp.asarray(T).reshape(self.nbf1,self.nbf2)
+        elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "kinetic_deriv" + str(deriv_order)
@@ -163,10 +169,13 @@ def potential_deriv_impl(self, geom, deriv_vec):
         deriv_order = np.sum(deriv_vec)
         idx = get_deriv_vec_idx(deriv_vec)
 
-        if 'core' in self.mode:
+        if self.mode == 'core':
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(V)
-        elif 'disk' in self.mode:
+        if self.mode == 'f12':
+            V = libint_interface.potential_deriv(deriv_vec)
+            return jnp.asarray(V).reshape(self.nbf1,self.nbf2)
+        elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
                 dataset_name = "potential_deriv" + str(deriv_order)
@@ -223,12 +232,13 @@ def potential_deriv_jvp(self, primals, tangents):
 
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP's
     # of each oei function
+    # When the input argument of deriv_batch is batched along the 0'th axis
+    # we want to evaluate every 2d slice, gather up a (ncart, n,n) array,
+    # (expand dims at 0 and concatenate at 0)
+    # and then return the results, indicating the out batch axis
+    # is in the 0th position (return results, 0)
+
     def overlap_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 2d slice, gather up a (ncart, n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, deriv_batch = batched_args
         geom_dim, deriv_dim = batch_dims
         results = []
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index 148f06c..c29852c 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -22,7 +22,7 @@ def __init__(self, basis1, basis2, basis3, basis4, xyz_path, max_deriv_order, op
         nbf3 = basis3.nbf()
         nbf4 = basis4.nbf()
 
-        if 'core' in mode and max_deriv_order > 0:
+        if mode == 'core' and max_deriv_order > 0:
             # A list of ERI derivative tensors, containing only unique elements
             # corresponding to upper hypertriangle (since derivative tensors are symmetric)
             # Length of tuple is maximum deriv order, each array is (upper triangle derivatives,nbf,nbf,nbf,nbf)
@@ -145,12 +145,16 @@ def eri_deriv_impl(self, geom, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use eri derivatives in memory
-        if 'core' in self.mode:
+        if self.mode == 'core':
             G = self.eri_derivatives[deriv_order-1][idx,:,:,:,:]
             return jnp.asarray(G)
 
+        if self.mode == 'f12':
+            G = libint_interface.eri_deriv(deriv_vec)
+            return jnp.asarray(G).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
+
         # Read from disk
-        elif 'disk' in self.mode:
+        elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("eri_derivs.h5"):
                 file_name = "eri_derivs.h5"
@@ -178,12 +182,12 @@ def f12_deriv_impl(self, geom, beta, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use f12 derivatives in memory
-        if 'core' in self.mode:
+        if self.mode == 'f12':
             F = libint_interface.f12_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
-        elif 'disk' in self.mode:
+        elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("f12_derivs.h5"):
                 file_name = "f12_derivs.h5"
@@ -211,12 +215,12 @@ def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use f12 squared derivatives in memory
-        if 'core' in self.mode:
+        if self.mode == 'f12':
             F = libint_interface.f12_squared_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
-        elif 'disk' in self.mode:
+        elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("f12_squared_derivs.h5"):
                 file_name = "f12_squared_derivs.h5"
@@ -244,12 +248,12 @@ def f12g12_deriv_impl(self, geom, beta, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use f12g12 derivatives in memory
-        if 'core' in self.mode:
+        if self.mode == 'f12':
             F = libint_interface.f12g12_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
-        elif 'disk' in self.mode:
+        elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("f12g12_derivs.h5"):
                 file_name = "f12g12_derivs.h5"
@@ -277,12 +281,12 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         # Use f12 double commutator derivatives in memory
-        if 'core' in self.mode:
+        if self.mode == 'f12':
             F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
-        elif 'disk' in self.mode:
+        elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
             if os.path.exists("f12_double_commutator_derivs.h5"):
                 file_name = "f12_double_commutator_derivs.h5"
@@ -307,6 +311,8 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
     # Create Jacobian-vector product rule, which given some input args (primals)
     # and a tangent std basis vector (tangent), returns the function evaluated at that point (primals_out)
     # and the slice of the Jacobian (tangents_out)
+    # For high-order differentiation, we add the current value of deriv_vec to the incoming tangent vector
+
     def eri_jvp(self, primals, tangents):
         geom, = primals
         primals_out = self.eri(geom)
@@ -316,8 +322,6 @@ def eri_jvp(self, primals, tangents):
     def eri_deriv_jvp(self, primals, tangents):
         geom, deriv_vec = primals
         primals_out = self.eri_deriv(geom, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector,
-        # so that nested higher order differentiation works
         tangents_out = self.eri_deriv(geom, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
@@ -330,8 +334,6 @@ def f12_jvp(self, primals, tangents):
     def f12_deriv_jvp(self, primals, tangents):
         geom, beta, deriv_vec = primals
         primals_out = self.f12_deriv(geom, beta, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector,
-        # so that nested higher order differentiation works
         tangents_out = self.f12_deriv(geom, beta, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
@@ -344,8 +346,6 @@ def f12_squared_jvp(self, primals, tangents):
     def f12_squared_deriv_jvp(self, primals, tangents):
         geom, beta, deriv_vec = primals
         primals_out = self.f12_squared_deriv(geom, beta, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector,
-        # so that nested higher order differentiation works
         tangents_out = self.f12_squared_deriv(geom, beta, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
@@ -358,8 +358,6 @@ def f12g12_jvp(self, primals, tangents):
     def f12g12_deriv_jvp(self, primals, tangents):
         geom, beta, deriv_vec = primals
         primals_out = self.f12g12_deriv(geom, beta, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector,
-        # so that nested higher order differentiation works
         tangents_out = self.f12g12_deriv(geom, beta, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
@@ -372,18 +370,17 @@ def f12_double_commutator_jvp(self, primals, tangents):
     def f12_double_commutator_deriv_jvp(self, primals, tangents):
         geom, beta, deriv_vec = primals
         primals_out = self.f12_double_commutator_deriv(geom, beta, deriv_vec)
-        # Here we add the current value of deriv_vec to the incoming tangent vector,
-        # so that nested higher order differentiation works
         tangents_out = self.f12_double_commutator_deriv(geom, beta, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP of tei
+    # When the input argument of deriv_batch is batched along the 0'th axis
+    # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
+    # (expand dims at 0 and concatenate at 0)
+    # and then return the results, indicating the out batch axis
+    # is in the 0th position (return results, 0)
+
     def eri_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, deriv_batch = batched_args
         geom_dim, deriv_dim = batch_dims
         results = []
@@ -394,11 +391,6 @@ def eri_deriv_batch(self, batched_args, batch_dims):
         return results, 0
     
     def f12_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, beta_batch, deriv_batch = batched_args
         geom_dim, beta_dim, deriv_dim = batch_dims
         results = []
@@ -409,11 +401,6 @@ def f12_deriv_batch(self, batched_args, batch_dims):
         return results, 0
 
     def f12_squared_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, beta_batch, deriv_batch = batched_args
         geom_dim, beta_dim, deriv_dim = batch_dims
         results = []
@@ -424,11 +411,6 @@ def f12_squared_deriv_batch(self, batched_args, batch_dims):
         return results, 0
 
     def f12g12_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, beta_batch, deriv_batch = batched_args
         geom_dim, beta_dim, deriv_dim = batch_dims
         results = []
@@ -439,11 +421,6 @@ def f12g12_deriv_batch(self, batched_args, batch_dims):
         return results, 0
 
     def f12_double_commutator_deriv_batch(self, batched_args, batch_dims):
-        # When the input argument of deriv_batch is batched along the 0'th axis
-        # we want to evaluate every 4d slice, gather up a (ncart, n,n,n,n) array,
-        # (expand dims at 0 and concatenate at 0)
-        # and then return the results, indicating the out batch axis
-        # is in the 0th position (return results, 0)
         geom_batch, beta_batch, deriv_batch = batched_args
         geom_dim, beta_dim, deriv_dim = batch_dims
         results = []
diff --git a/quax/methods/basis_utils.py b/quax/methods/basis_utils.py
new file mode 100644
index 0000000..a3b4501
--- /dev/null
+++ b/quax/methods/basis_utils.py
@@ -0,0 +1,120 @@
+import psi4
+import jax
+import jax.numpy as jnp
+from jax.lax import fori_loop
+
+from .ints import compute_f12_oeints
+from .energy_utils import symmetric_orthogonalization
+
+def build_RIBS(molecule, basis_set, cabs_name):
+    """
+    Builds basis set for
+    CABS procedure
+    """
+
+    # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
+    basis_name = basis_set.name()
+    try:
+        psi4_name = cabs_name.lower().replace('cabs', 'optri')
+    except:
+        raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
+
+    keys = ["BASIS","CABS_BASIS"]
+    targets = [basis_name, psi4_name]
+    roles = ["ORBITAL","F12"]
+    others = [basis_name, basis_name]
+
+    # Creates combined basis set in Python
+    ao_union = psi4.driver.qcdb.libmintsbasisset.BasisSet.pyconstruct_combined(molecule.save_string_xyz(), keys, targets, roles, others)
+    ao_union['name'] = cabs_name
+    ribs_set = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
+
+    return ribs_set
+
+def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
+    """
+    Builds and returns 
+    CABS transformation matrix
+    """
+    # Make Thread Safe
+    threads = psi4.get_num_threads()
+    psi4.set_num_threads(1)
+
+    # Orthogonalize combined basis set
+    S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
+    C_ribs = symmetric_orthogonalization(S_ao_ribs_ribs, 1.0e-8)
+
+    # Compute the overlap matrix between OBS and RIBS
+    S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
+
+    U, S, Vt = svd_full(S_ao_obs_ribs @ C_ribs)
+
+    def loop_zero_vals(idx, count):
+        count += jax.lax.cond(abs(S[idx]) < 1.0e-6, lambda: 1, lambda: 0)
+        return count
+    ncabs = fori_loop(0, S.shape[0], loop_zero_vals, S.shape[0])
+
+    V_N = jnp.transpose(Vt[ncabs:, :])
+
+    C_cabs = jnp.dot(C_ribs, V_N)
+
+    psi4.set_num_threads(threads)
+
+    return C_cabs
+
+def F_ij(s, m):
+    """
+    Code from https://github.com/williamberman/svd-derivative/blob/main/svd-derivative.ipynb
+    """
+
+    F_ij = lambda i, j: jax.lax.cond(i == j, lambda: 0., lambda: 1 / (s[j]**2 - s[i]**2))
+    F_fun = jax.vmap(jax.vmap(F_ij, (None, 0)), (0, None))
+
+    indices = jnp.arange(m)
+    F = F_fun(indices, indices)
+
+    return F
+
+@jax.custom_jvp
+def svd_full(A):
+
+    U, S, Vt = jnp.linalg.svd(A)
+
+    return U, S, Vt
+
+@svd_full.defjvp
+def svd_full_jvp(primals, tangents):
+    A, = primals
+    dA, = tangents
+
+    m = A.shape[0]
+    n = A.shape[1]
+
+    U, S, Vt = svd_full(A)
+
+    dP = U.T @ dA @ Vt.T
+
+    dS = jnp.fill_diagonal(jnp.zeros((m, n)), 1, inplace=False) * dP
+
+    S1 = jnp.diag(S)
+
+    dP1 = dP[:, :m]
+
+    F = F_ij(S, m)
+
+    dU = U @ (F * (dP1 @ S1 + S1 @ dP1.T))
+
+    dD1 = F * (S1 @ dP1 + dP1.T @ S1)
+
+    dD2 = jnp.linalg.inv(S1) @ dP[:, m:]
+
+    dD3 = jnp.zeros((n-m, n-m))
+
+    dD_left = jnp.concatenate((dD1, dD2.T))
+    dD_right = jnp.concatenate((-dD2, dD3))
+
+    dD = jnp.concatenate((dD_left, dD_right), axis=1)
+
+    dV = Vt.T @ dD
+
+    return (U, S, Vt), (dU, jnp.diagonal(dS), dV.T)
\ No newline at end of file
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index 8df2234..bfcf9be 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -21,15 +21,7 @@ def symmetric_orthogonalization(S, cutoff = 1.0e-12):
     """
     evals, evecs = jnp.linalg.eigh(S)
 
-    def loop_evals(idx, M):
-        val = jax.lax.cond(abs(evals[idx]) > cutoff * jnp.max(abs(evals)),
-                           lambda: 1 / jnp.sqrt(evals[idx]),
-                           lambda: 0.0)
-        
-        M = M.at[idx, idx].set(val)
-        return M
-    
-    sqrtm = jax.lax.fori_loop(0, evals.shape[0], loop_evals, jnp.zeros(S.shape))
+    sqrtm = jnp.diag(jnp.where(abs(evals) > cutoff, 1 / jnp.sqrt(abs(evals)), 0.0))
 
     A = evecs @ sqrtm @ evecs.T
     return A
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index c48b73a..cb51a53 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -78,7 +78,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cab
 
         else:
             # Precompute OEI derivatives
-            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12')
             # Compute integrals
             S = oei_obj.overlap(geom)
         
@@ -102,7 +102,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cab
 
         else:
             # Precompute OEI derivatives
-            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'core')
+            oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'f12')
             # Compute integrals
             T = oei_obj.kinetic(geom)
             V = oei_obj.potential(geom)
@@ -158,7 +158,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
 
     else:
         # Precompute TEI derivatives
-        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'core')
+        tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'f12')
         # Compute integrals
         match int_type:
             case "f12":
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 1dfb01a..09aad86 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -6,9 +6,9 @@
 import sys
 jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
 
-from ..integrals.basis_utils import build_CABS
+from .basis_utils import build_CABS
 from .ints import compute_f12_oeints, compute_f12_teints
-from .energy_utils import partial_tei_transformation
+from .energy_utils import partial_tei_transformation, cartesian_product
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=0):
@@ -49,53 +49,44 @@ def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, optio
     G = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
                              C_obs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
     
-    # indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
+    indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
-    # def loop_energy(idx, f12_corr):
-        # i, j = indices[idx]
-    
-    dE_mp2f12 = 0.0
-    for i in range(ndocc):
-        for j in range(i, ndocc):
-            kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
-
-            D_ij = D[i, j, :, :]
+    def loop_energy(idx, f12_corr):
+        i, j = indices[idx]
+        kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
 
-            GD_ij = jnp.einsum('ab,ab->ab', G[i, j, ndocc:, ndocc:], D_ij, optimize='optimal')
-            V_ij = V[i, j, :, :] - jnp.einsum('klab,ab->kl', C, GD_ij, optimize='optimal')
+        D_ij = D[i, j, :, :]
 
-            V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
+        GD_ij = jnp.einsum('ab,ab->ab', G[i, j, ndocc:, ndocc:], D_ij, optimize='optimal')
+        V_ij = V[i, j, :, :] - jnp.einsum('klab,ab->kl', C, GD_ij, optimize='optimal')
 
-            V_t = 0.25 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
-                                                   * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
+        V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
-            CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
-            B_ij = B - (X * (f[i, i] + f[j, j])) - jnp.einsum('klab,mnab->klmn', C, CD_ij, optimize='optimal')
+        V_t = 0.25 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
+                                               * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
-            B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
-                         * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
-                         * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
+        CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
+        B_ij = B - (X * (f[i, i] + f[j, j])) - jnp.einsum('klab,mnab->klmn', C, CD_ij, optimize='optimal')
 
-            B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
-                                                     * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
-                                                     * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
-                                                     lambda: 0.0)
+        B_s = 0.125 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd \
+                     * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
+                     * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
 
-            E_s = kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
-            E_t = 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
+        B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
+                                                 * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
+                                                 * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
+                                                 lambda: 0.0)
 
-            # print(E_s)
-            # print(E_t)
+        f12_corr += kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
+        f12_corr += 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
 
-            dE_mp2f12 += E_s + E_t
+        return f12_corr
 
-    #     return f12_corr
+    dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
 
-    # dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
+    E_s = cabs_singles(f, ndocc, nri)
 
-    jax.debug.print("OG: {e}", e=dE_mp2f12)
-
-    return dE_mp2f12
+    return E_mp2 + dE_mp2f12 + E_s
 
 # Fixed Amplitude Ansatz
 @jax.jit
@@ -255,12 +246,15 @@ def cabs_singles(f, ndocc, nri):
     e_ij, C_ij = jnp.linalg.eigh(f[:ndocc, :ndocc])
     e_AB, C_AB = jnp.linalg.eigh(f[ndocc:, ndocc:])
 
-    f_iA = C_ij @ f[:ndocc, ndocc:] @ C_AB.T
+    f_iA = C_ij.T @ f[:ndocc, ndocc:] @ C_AB
 
-    E_s = 0.0
-    for A in range(all_vir):
-        for i in range(ndocc):
-            E_s += (2 * f_iA[i, A] ** 2) / (e_ij[i] - e_AB[A])
+    indices = cartesian_product(jnp.arange(ndocc), jnp.arange(all_vir))
+    
+    def loop_singles(idx, singles):
+        i, A = indices[idx]
+        singles += 2 * f_iA[i, A]**2 / (e_ij[i] - e_AB[A])
+        return singles
+    E_s = fori_loop(0, indices.shape[0], loop_singles, 0.0)
 
     return E_s
 

From be2fba04c6a9ce944f0e8e6f6119ea730e614566 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 4 Jan 2024 11:12:24 -0500
Subject: [PATCH 36/91] Docs update, commenting

---
 README.md                   | 179 ++----------------------------------
 quax/methods/basis_utils.py |   3 +-
 setup.py                    |  13 +--
 3 files changed, 18 insertions(+), 177 deletions(-)

diff --git a/README.md b/README.md
index 1c44c21..016b95c 100644
--- a/README.md
+++ b/README.md
@@ -110,12 +110,9 @@ Obviously, for large basis sets and molecules, these arrays get very big very fa
 Unless you have impressive computing resources, partial derivatives are recommended for higher order derivatives.
 
 ### Caveats
-Our integrals code is _slow_. Using the Libint interface is highly recommended. However, compiling Libint for support for very high order
-derivatives (5th, 6th) takes a very long time and causes the library size to be very large (sometimes so large it's uncompilable), so using the Quax integrals
-is the best bet at this time.
+The Libint interface is a necessary dependency for Quax. However, compiling Libint for support for very high order
+derivatives (5th, 6th) takes a very long time and causes the library size to be very large (sometimes so large it's uncompilable).
 We will incrementally roll out improvements which allow user specification for how to handle higher-order integral derivatives.
-For example, control over when to use disk vs core memory, and whether Libint or Quax integral derivatives are computed.
-In principle, the Quax integrals code could also be improved.
 Contributions and suggestions are welcome.
 
 Also, we do not recommend computing derivatives of systems with many degenerate orbitals.
@@ -128,27 +125,16 @@ Workarounds for this are coming soon.
 ### Anaconda Environment installation instructions
 To use Quax, only a few dependencies are needed. We recommend using a clean Anaconda environment: 
 ```
-conda create -n quax python=3.7
+conda create -n quax python=3.10
 conda activate quax
-conda install -c psi4 psi4
+conda install psi4 python=3.10 -c conda-forge/label/libint_dev -c conda-forge
 python setup.py install
 ```
 
-This is sufficient to use Quax without the Libint interface.
-
 ### Building the Libint Interface
-If you plan to use the Libint interface (highly recommnded), you can install those dependencies as well.
+For the Libint interface, you must install those dependencies as well.
 ```
-conda install libstdcxx-ng
-conda install gcc_linux-64
-conda install gxx_linux-64
-conda install ninja
-conda install boost
-conda install eigen3
-conda install gmp
-conda install bzip2
-conda install cmake
-conda install pybind11
+conda install libstdcxx-ng gcc_linux-64 gxx_linux-64 ninja boost eigen3 gmp bzip2 cmake pybind11
 ```
 
 We note here that the default gcc version (4.8) that comes with `conda install gcc` is not recent enough to successfully compile the Quax-Libint interface.
@@ -171,7 +157,7 @@ cd libint
 mkdir BUILD
 cd BUILD
 mkdir PREFIX
- ../configure --prefix=/home/adabbott/Git/libint/libint/build/PREFIX --with-max-am=2 --with-opt-am=0 --enable-1body=4 --enable-eri=4 --with-multipole-max-order=0 --enable-eri3=no --enable-eri2=no --enable-g12=no --enable-g12dkh=no --with-pic --enable-static --enable-single-evaltype --enable-generic-code --disable-unrolling
+ ../configure --prefix=/path/to/libint/build/PREFIX --with-max-am=2 --with-opt-am=0 --enable-1body=4 --enable-eri=4 --with-multipole-max-order=0 --enable-eri3=no --enable-eri2=no --enable-g12=no --enable-g12dkh=no --with-pic --enable-static --enable-single-evaltype --enable-generic-code --disable-unrolling
 
 make export
 ```
@@ -205,7 +191,7 @@ Also note that Libint recommends using Ninja to build for performance reasons. T
 `cmake . -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
 
 ### Compiling the Libint-Quax interface
-Once Libint is installed, the makefile in `quax/external_integrals/makefile` needs to be edited with your compiler and the proper paths specifying the locations
+Once Libint is installed, the makefile in `quax/integrals/makefile` needs to be edited with your compiler and the proper paths specifying the locations
 of headers and libraries for Libint, pybind11, HDF5, and python. 
 
 The `LIBINT_PREFIX` path in the makefile is wherever you installed the headers and the static library `lib/libint2.a`. 
@@ -213,152 +199,7 @@ All of the required headers and libraries should be discoverable in the Anaconda
 After editing the paths appropriately and setting the CC compiler to `x86_64-conda_cos6-linux-gnu-gcc`, or 
 if you have a nice modern compiler available, use that.
 
-Running `make` in the directory `quax/external_integrals/` to compile the Libint interface.
-
-
-<!---
-The library requires several dependencies, most of which are taken care of with `setup.py`.
-To install, clone this repository, and run 
-```pip install .```
-If you plan to develop/edit/play around with the code,
-install with `pip install -e .` so that the installation will automatically update when changes are made.
-This takes care of the following dependencies, according to the contents of `setup.py`.
-```
-numpy
-jax
-jaxlib
-h5py
-```
-
-In addition to the dependencies in `setup.py`, this library requires an installation of Psi4.
-The easiest way to install psi4 is with Anaconda:
-`conda install -c psi4 psi4`
-If you do not want to use Anaconda, you can install Psi4 from source (much more difficult).
-These installation options (Psi4, and the dependencies in `setup.py`) are sufficient
-for computing derivatives of electronic structure methods.
-
-### Integral Derivative Computation
-A primary bottleneck of the code is the computation of nuclear derivatives of one and two electron integrals over Gaussian basis functions.
-We feature a very simple integral code built using entirely JAX utilities in the `integrals/oei.py` and `integrals/tei.py`. 
-This code works for arbitrary angular momentum and arbitary order derivatives, however it is quite slow and has high memory usage
-due to the overhead associated with JIT compilation and the derivative code generation which occurs every time the program is run.
-
-To avoid that performance issue, simply use the library with [Libint](https://github.com/evaleev/libint) (**strongly** recommended).
-Note that Libint needs to be configured for the order of differentation and maximum angular momentum
-you wish to support. By default, higher order derivatives of one and two electron integrals are not configured,
-they have to be specifically requested, e.g. for fourth derivatives, 
-it must be compiled with configure flags `--enable-1body=4 --enable-eri=4`. See the Libint installation instructions for details.
-Depending on these configuration options, the generation of a Libint library and subsequent compilation 
-can take a few days or even over a week. A preconfigured tarball which supports up to f functions and
-fourth order derivatives will be made available by some means in the future. 
-
-For building with Libint, more dependencies are introduced, some of which are needed for Libint, and others
-are needed for the Libint interface for this software. I strongly recommend dumping everything
-into a clean Anaconda environment.
-To generate a clean conda environment for running the code,
-```
-conda create -n psijax python=3.6
-conda activate psijax 
-conda install -c psi4 psi4
-```
-
-Then install the dependencies needed for the Libint interface:
-```
-conda install -c conda-forge pybind11
-conda install -c omnia eigen3
-conda install hdf5
-conda install gmp
-conda install bzip2
-conda install boost
-conda install cmake
-conda install libstdcxx-ng
-conda install -c conda-forge libcxx
-```
-
-
-NEW  have to install
-```
-conda create -n jax python=3.6
-conda activate jax
-conda install ninja
-conda install -c omnia eigen3
-conda install gcc
-conda install -c conda-forge pybind11
-conda install gcc_linux-64  ###THIS ONE
-conda install boost
-```
-
-Libint's gmp issues can be taken care of by installing `conda install gcc_linux-64`
-Also need `conda install gxx_linux-64` 
-
-
-### Building the Libint Interface
-
-The default gcc version 4.8 that comes with `conda install gcc` is not recent enough to successfully compile the Quax-Libint interface.
-You must instead use a more modern compiler. To do this in anaconda, we need to use
-`x86_64-conda_cos6-linux-gnu-gcc` as our compiler instead of gcc.
-This is available by installing `gcc_linux-64` and `gxx_linux-64`.
-Feel free to try other more 
-Thus a complete anaconda envrionment, containing everything you need to run the code and compile the Libint interface,
-would include:
-
-```
-conda create -n quax python=3.7
-conda activate quax 
-conda install -c psi4 psi4
-conda install gcc_linux-64
-conda install gxx_linux-64
-conda install ninja
-conda install boost
-conda install eigen3
-conda install gmp
-conda install bzip2
-conda install cmake
-conda install pybind11
-
-pip install jax
-pip install jaxlib
-conda install h5py
-```
-
-These are sufficient to compile the Libint interface.
-Head over to `external_integrals/` directory and edit the makefile with the appropriate paths.
-All of the required headers and libraries should be discoverable in the Anaconda environment's include and lib paths.
-After editing the paths appropriately and setting the CC compiler to `x86_64-conda_cos6-linux-gnu-gcc`, or 
-if you have a nice modern compiler available, use that.
-
-Libint's gmp issues can be taken care of by installing `conda install gcc_linux-64`
-Also need `conda install gxx_linux-64` 
-
-
-Now, given a Libint tarball which supports the desired maximum angular momentum and derivative order,
-we need to unpack the library, `cd` into it, and `mkdir PREFIX` where the headers and static library will be stored.
-Then it is built and compiled. The position independent code flag is required for Libint to play nice with pybind11.
-The `-j4` flag instructs how many processors to use in compilation, and can be adjusted according to your system. The `--target check` runs the Libint test suite; it is not required.
-The --target check runs test suite, and finally the install command installs the headers and static library into the PREFIX directory.
-```
-tar -xvf libint_*.tgz
-cd libint-*/
-mkdir PREFIX
-cmake . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-cmake --build . -- -j4
-cmake --build . --target check
-cmake --build . --target install
-```
-
-
-### Installing Libint in a clean conda environment
-Note that the cmake command may not find various libraries for the dependencies of Libint.
-`cmake . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
-To fix this, you may need to explicitly point to it
-`export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/vulcan/adabbott/.conda/envs/quax/lib/`
-and then run the above cmake command.
-
-Also note that Libint recommends using Ninja to build for performance reasons. This can be done if Ninja is installed:
-`cmake . -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
-
-Once Libint is installed, the makefile in `external_integrals/makefile` needs to be edited to the proper paths specifying the locations
-of headers and libraries for Libint, pybind11, HDF5, and python. Then run `make` to compile the Libint interface.
+Run `make` in the directory `quax/integrals/` to compile the Libint interface.
 
 ### Citing Quax
 If you use Quax in your research, we would appreciate a citation:
@@ -374,5 +215,3 @@ If you use Quax in your research, we would appreciate a citation:
 }
 ```
 We also kindly request you give credit to the projects which make up the dependencies of Quax.
-
-
diff --git a/quax/methods/basis_utils.py b/quax/methods/basis_utils.py
index a3b4501..e33dc6b 100644
--- a/quax/methods/basis_utils.py
+++ b/quax/methods/basis_utils.py
@@ -65,6 +65,7 @@ def loop_zero_vals(idx, count):
 def F_ij(s, m):
     """
     Code from https://github.com/williamberman/svd-derivative/blob/main/svd-derivative.ipynb
+    Can be numerically unstable if singular values are degenerate
     """
 
     F_ij = lambda i, j: jax.lax.cond(i == j, lambda: 0., lambda: 1 / (s[j]**2 - s[i]**2))
@@ -106,7 +107,7 @@ def svd_full_jvp(primals, tangents):
 
     dD1 = F * (S1 @ dP1 + dP1.T @ S1)
 
-    dD2 = jnp.linalg.inv(S1) @ dP[:, m:]
+    dD2 = jnp.linalg.inv(S1) @ dP[:, m:] # Can be numerically unstable due to inversion
 
     dD3 = jnp.zeros((n-m, n-m))
 
diff --git a/setup.py b/setup.py
index 8b7562a..6f0e3b1 100644
--- a/setup.py
+++ b/setup.py
@@ -3,19 +3,20 @@
 if __name__ == "__main__":
     setuptools.setup(
         name='quax',
-        version="0.1.1",
+        version="0.2.0a1",
         description='Arbitrary order derivatives of electronic structure computations.',
-        author='Adam Abbott',
-        author_email='adabbott@uga.edu',
+        author='Adam Abbott, Erica Mitchell',
+        author_email='adabbott@uga.edu, emitchell@uga.edu',
         url="none",
         license='BSD-3C',
         packages=setuptools.find_packages(where="quax"),
         package_dir={"": "quax"},
         install_requires=[
-            'numpy>=1.7',
-            'jax>=0.2.9',
-            'jaxlib>=0.1.61',
+            'numpy>=1.23',
+            'jax>=0.4.19',
+            'jaxlib>=0.4.19',
             'h5py>=2.8.0'
+            'scipy>=1.9'
         ],
         extras_require={
             'tests': [

From d6625498d1ed78c2f3a486dad1bf449472d7d467 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 22 Jan 2024 10:24:08 -0500
Subject: [PATCH 37/91] Spectral shift for MP2-F12

---
 quax/methods/basis_utils.py  | 13 ++++++++++---
 quax/methods/hartree_fock.py |  3 +--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/quax/methods/basis_utils.py b/quax/methods/basis_utils.py
index e33dc6b..727f176 100644
--- a/quax/methods/basis_utils.py
+++ b/quax/methods/basis_utils.py
@@ -2,6 +2,7 @@
 import jax
 import jax.numpy as jnp
 from jax.lax import fori_loop
+import functools
 
 from .ints import compute_f12_oeints
 from .energy_utils import symmetric_orthogonalization
@@ -42,12 +43,19 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
 
     # Orthogonalize combined basis set
     S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
+
+    if options['spectral_shift']:
+        convergence = 1e-8
+        fudge = jnp.asarray(jnp.linspace(0, 1, S_ao_ribs_ribs.shape[0])) * convergence
+        shift = jnp.diag(fudge)
+        S_ao_ribs_ribs += shift
+
     C_ribs = symmetric_orthogonalization(S_ao_ribs_ribs, 1.0e-8)
 
     # Compute the overlap matrix between OBS and RIBS
     S_ao_obs_ribs = compute_f12_oeints(geom, basis_set, cabs_set, xyz_path, deriv_order, options, True)
 
-    U, S, Vt = svd_full(S_ao_obs_ribs @ C_ribs)
+    _, S, Vt = svd_full(S_ao_obs_ribs @ C_ribs)
 
     def loop_zero_vals(idx, count):
         count += jax.lax.cond(abs(S[idx]) < 1.0e-6, lambda: 1, lambda: 0)
@@ -64,7 +72,6 @@ def loop_zero_vals(idx, count):
 
 def F_ij(s, m):
     """
-    Code from https://github.com/williamberman/svd-derivative/blob/main/svd-derivative.ipynb
     Can be numerically unstable if singular values are degenerate
     """
 
@@ -118,4 +125,4 @@ def svd_full_jvp(primals, tangents):
 
     dV = Vt.T @ dD
 
-    return (U, S, Vt), (dU, jnp.diagonal(dS), dV.T)
\ No newline at end of file
+    return (U, S, Vt), (dU, jnp.diagonal(dS), dV.T)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index ebaa392..026028a 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -1,7 +1,6 @@
 import jax 
 jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-import numpy as np
 import psi4
 
 from .ints import compute_integrals
@@ -36,7 +35,7 @@ def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge,
     if spectral_shift:
         # Shifting eigenspectrum requires lower convergence.
         convergence = 1e-8 
-        fudge = jnp.asarray(np.linspace(0, 1, nbf)) * convergence
+        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * convergence
         shift = jnp.diag(fudge)
     else:
         shift = jnp.zeros_like(S)

From c593054b45d76908ec778494918af19e576136bd Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Thu, 25 Jan 2024 10:07:30 -0500
Subject: [PATCH 38/91] Overhaul of libint_interface

---
 quax/integrals/libint_interface.cc | 3813 +++++++---------------------
 quax/integrals/oei.py              |   30 +-
 quax/integrals/tei.py              |  100 +-
 quax/methods/basis_utils.py        |   19 +-
 quax/methods/ints.py               |   96 +-
 5 files changed, 1105 insertions(+), 2953 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index f974a72..8e83cb4 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -121,8 +121,10 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
     shell2atom_3 = bs3.shell2atom(atoms);
     shell2atom_4 = bs4.shell2atom(atoms);
 
-    max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()), std::max(bs3.max_nprim(), bs4.max_nprim()));
-    max_l = std::max(std::max(bs1.max_l(), bs2.max_l()), std::max(bs3.max_l(), bs4.max_l()));
+    max_nprim = std::max(std::max(bs1.max_nprim(), bs2.max_nprim()),
+                         std::max(bs3.max_nprim(), bs4.max_nprim()));
+    max_l = std::max(std::max(bs1.max_l(), bs2.max_l()),
+                     std::max(bs3.max_l(), bs4.max_l()));
 
     // Get number of OMP threads
 #ifdef _OPENMP
@@ -170,12 +172,15 @@ std::vector<std::pair<double, double>> take_square(std::vector<std::pair<double,
 
 // Cartesian product of arbitrary number of vectors, given a vector of vectors
 // Used to find all possible combinations of indices which correspond to desired nuclear derivatives
-// For example, if molecule has two atoms, A and B, and we want nuclear derivative d^2/dAz dBz, represented by deriv_vec = [0,0,1,0,0,1], 
-// and we are looping over 4 shells in ERI's, and the four shells are atoms (0,0,1,1), then possible indices 
+// For example, if molecule has two atoms, A and B, and we want nuclear derivative d^2/dAz dBz,
+// represented by deriv_vec = [0,0,1,0,0,1], and we are looping over 4 shells in ERI's,
+// and the four shells are atoms (0,0,1,1), then possible indices 
 // of the 0-11 shell cartesian component indices are {2,5} for d/dAz and {8,11} for d/dBz.
-// So the vector passed to cartesian_product is { {{2,5},{8,11}}, and all combinations of elements from first and second subvectors
-// are produced, and the total nuclear derivative of the shell is obtained by summing all of these pieces together.
-// These resulting indices are converted to flattened Libint buffer indices using the generate_*_lookup functions, explained below.
+// So the vector passed to cartesian_product is {{2,5},{8,11}}, and all combinations of elements
+// from first and second subvectors are produced, and the total nuclear derivative of the shell
+// is obtained by summing all of these pieces together.
+// These resulting indices are converted to flattened Libint buffer indices using the generate_*_lookup functions,
+// explained below.
 std::vector<std::vector<int>> cartesian_product (const std::vector<std::vector<int>>& v) {
     std::vector<std::vector<int>> s = {{}};
     for (const auto& u : v) {
@@ -256,59 +261,28 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
     return combos;
 }
 
-// Compute overlap integrals
-py::array overlap() {
-    // Overlap integral engine
-    std::vector<libint2::Engine> s_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l);
-    for (size_t i = 1; i != nthreads; ++i) {
-        s_engines[i] = s_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2;
-    std::vector<double> result(length); // vector to store integral array
-
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-            auto n1 = bs1[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-            auto n2 = bs2[s2].size(); // number of basis functions in second shell
-
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
-
-            auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-            if (ints_shellset == nullptr)
-                continue;  // nullptr returned if the entire shell-set was screened out
-
-            // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
-                }
-            }
-        }
+// Compute one-electron integral
+py::array compute_1e_int(std::string type) {
+    // Integral engine
+    std::vector<libint2::Engine> engines(nthreads);
+    
+    if (type == "overlap") {
+        engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l);
+    } else if (type == "kinetic") {
+        engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l);
+    } else if (type == "potential") {
+        engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l);
+        engines[0].set_params(make_point_charges(atoms));
+    } else {
+       throw std::invalid_argument("type must be overlap, kinetic, or potential");
     }
-    return py::array(result.size(), result.data()); 
-}
 
-// Compute kinetic energy integrals
-py::array kinetic() {
-    // Kinetic energy integral engine
-    std::vector<libint2::Engine> t_engines(nthreads);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
-        t_engines[i] = t_engines[0];
+        engines[i] = engines[0];
     }
 
     size_t length = nbf1 * nbf2;
-    std::vector<double> result(length);
+    std::vector<double> result(length); // vector to store integral array
 
 #pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != bs1.size(); ++s1) {
@@ -322,8 +296,8 @@ py::array kinetic() {
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
+            engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
@@ -337,58 +311,38 @@ py::array kinetic() {
             }
         }
     }
-    return py::array(result.size(), result.data());
+    return py::array(result.size(), result.data()); 
 }
 
-// Compute nuclear-electron potential energy integrals
-py::array potential() {
-    // Potential integral engine
-    std::vector<libint2::Engine> v_engines(nthreads);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l);
-    v_engines[0].set_params(make_point_charges(atoms));
-    for (size_t i = 1; i != nthreads; ++i) {
-        v_engines[i] = v_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2;
-    std::vector<double> result(length);
-
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-            auto n1 = bs1[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-            auto n2 = bs2[s2].size(); // number of basis functions in second shell
-
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
-
-            auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-            if (ints_shellset == nullptr)
-                continue;  // nullptr returned if the entire shell-set was screened out
+// Computes two-electron integrals
+py::array compute_2e_int(std::string type, double beta) {
+    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++?
+    // avoids last line, which copies
+    std::vector<libint2::Engine> eri_engines(nthreads);
 
-            // Loop over shell block, keeping a total count idx for the size of shell set
-            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    // idx = x + (y * width) where x = bf2 + f2 and y = bf1 + f1 
-                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
-                }
-            }
-        }
+    if (type == "eri") {
+        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
+    } else if (type == "f12") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12g12") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12_squared") {
+        auto cgtg_params = take_square(make_cgtg(beta));
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12_double_commutator") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0,
+                                            std::numeric_limits<libint2::scalar_type>::epsilon(),
+                                            cgtg_params, libint2::BraKet::xx_xx);
+    } else {
+        throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
     }
-    return py::array(result.size(), result.data());
-}
 
-// Computes electron repulsion integrals
-py::array eri() {
-    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
     for (size_t i = 1; i != nthreads; ++i) {
         eri_engines[i] = eri_engines[0];
     }
@@ -438,234 +392,267 @@ py::array eri() {
             }
         }
     }
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+    return py::array(result.size(), result.data());
+    // This apparently copies data, but it should be fine right?
+    // https://github.com/pybind/pybind11/issues/1042 there's a workaround
 }
 
-// Computes integrals of contracted Gaussian-type geminal
-py::array f12(double beta) {
-    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_engines(nthreads);
-    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
-    cgtg_engines[0].set_params(cgtg_params);
+// Computes nuclear derivatives of one-electron integrals 
+py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    // Get order of differentiation
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index)
+    // to multidimensional shell derivative index
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    int d1_buf_idx = (type == "potential") ? 6 + ncart : 6;
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(d1_buf_idx, deriv_order);
+    
+
+    // One-electron integral derivative engine
+    std::vector<libint2::Engine> engines(nthreads);
+
+    if (type == "overlap") {
+        engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+    } else if (type == "kinetic") {
+        engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+    } else if (type == "potential") {
+        engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+        engines[0].set_params(make_point_charges(atoms));
+    } else {
+       throw std::invalid_argument("type must be overlap, kinetic, or potential");
+    }
+
     for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_engines[i] = cgtg_engines[0];
+        engines[i] = engines[0];
     }
 
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
+    // Get size of derivative array and allocate
+    size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
-    
-#pragma omp parallel for collapse(4) num_threads(nthreads)
+
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != bs1.size(); ++s1) {
         for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3=0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
-                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
-                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
-
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_engines[thread_id].results(); // will point to computed shell sets
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
 
-                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-                    if (ints_shellset == nullptr)
-                        continue;  // nullptr returned if the entire shell-set was screened out
+            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
-                    // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                        for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                            for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf4;
-                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
-                                }
-                            }
+            // For every desired atom derivative, check shell and nuclear indices for a match,
+            // add it to subvector for that derivative
+            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+            for (int j = 0; j < desired_atom_indices.size(); j++){
+                int desired_atom_idx = desired_atom_indices[j];
+                // Shell indices
+                for (int i = 0; i < 2; i++){
+                    int atom_idx = shell_atom_index_list[i];
+                    if (atom_idx == desired_atom_idx) { 
+                        int tmp = 3 * i + desired_coordinates[j];
+                        indices[j].push_back(tmp);
+                    }
+                }
+                
+                if (type == "potential") {
+                    for (int i = 0; i < natom; i++){
+                        // i = shell_atom_index_list[i];
+                        if (i == desired_atom_idx) {
+                            int tmp = 3 * (i + 2) + desired_coordinates[j];
+                            indices[j].push_back(tmp);
                         }
                     }
                 }
             }
-        }
-    }
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes integrals of squared contracted Gaussian-type geminal
-py::array f12_squared(double beta) {
-    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    auto cgtg_params = take_square(make_cgtg(beta));
-    std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
-    cgtg_squared_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_squared_engines[i] = cgtg_squared_engines[0];
-    }
+            
+            // Now indices is a vector of vectors, where each subvector is your choices
+            // for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+            std::vector<int> buffer_indices;
 
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-    
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3=0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
-                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
-                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+            // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
+            }
 
-                    size_t thread_id = 0;
+            // Compute the integrals
+            size_t thread_id = 0;
 #ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
-
-                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-                    if (ints_shellset == nullptr)
-                        continue;  // nullptr returned if the entire shell-set was screened out
+            engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
-                    // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                        for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                            for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf4;
-                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
-                                }
-                            }
-                        }
+            // Loop over every buffer index and accumulate for every shell set.
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto ints_shellset = buf_vec[buffer_indices[i]];
+                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
                     }
                 }
             }
         }
     }
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+    return py::array(result.size(), result.data()); 
 }
 
-// Computes electron repulsion integrals of contracted Gaussian-type geminal
-py::array f12g12(double beta) {
-    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
-    cgtg_coulomb_engines[0].set_params(cgtg_params);
+// Computes nuclear derivatives of two-electron integrals
+py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv_vec) {
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    // Get order of differentiation
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index)
+    // to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+
+    // ERI derivative integral engine
+    std::vector<libint2::Engine> eri_engines(nthreads);
+
+    if (type == "eri") {
+        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+    } else if (type == "f12") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12g12") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12_squared") {
+        auto cgtg_params = take_square(make_cgtg(beta));
+        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        eri_engines[0].set_params(cgtg_params);
+    } else if (type == "f12_double_commutator") {
+        auto cgtg_params = make_cgtg(beta);
+        eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
+                                            std::numeric_limits<libint2::scalar_type>::epsilon(),
+                                            cgtg_params, libint2::BraKet::xx_xx);
+    } else {
+        throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
+    }
+
     for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
+        eri_engines[i] = eri_engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4;
     std::vector<double> result(length);
-    
+
 #pragma omp parallel for collapse(4) num_threads(nthreads)
     for(auto s1 = 0; s1 != bs1.size(); ++s1) {
         for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3=0; s3 != bs3.size(); ++s3) {
+            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
                 for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
-                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
-                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
-
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
-
-                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-                    if (ints_shellset == nullptr)
-                        continue;  // nullptr returned if the entire shell-set was screened out
+                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
-                    // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                        for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                            for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf4;
-                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
-                                }
+                    // If the atoms are the same we ignore it as the derivatives will be zero.
+                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                    // Ensure all desired_atoms correspond to at least one shell atom to
+                    // ensure desired derivative exists. else, skip this shell quartet.
+                    bool atoms_not_present = false;
+                    for (int i = 0; i < deriv_order; i++){
+                        if (atom1 == desired_atom_indices[i]) continue; 
+                        else if (atom2 == desired_atom_indices[i]) continue;
+                        else if (atom3 == desired_atom_indices[i]) continue;
+                        else if (atom4 == desired_atom_indices[i]) continue;
+                        else {atoms_not_present = true; break;}
+                    }
+                    if (atoms_not_present) continue;
+
+                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+                
+                    // For every desired atom derivative, check shell indices for a match,
+                    // add it to subvector for that derivative
+                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                    for (int j = 0; j < desired_atom_indices.size(); j++){
+                        int desired_atom_idx = desired_atom_indices[j];
+                        // Shell indices
+                        for (int i = 0; i < 4; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coordinates[j];
+                                indices[j].push_back(tmp);
                             }
                         }
                     }
-                }
-            }
-        }
-    }
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes gradient norm of contracted Gaussian-type geminal
-py::array f12_double_commutator(double beta) {
-    // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++? avoids last line, which copies
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
-    cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0, 0., cgtg_params, libint2::BraKet::xx_xx);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_del_engines[i] = cgtg_del_engines[0];
-    }
+                    
+                    // Now indices is a vector of vectors, where each subvector is your choices
+                    // for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product 
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                    std::vector<int> buffer_indices;
 
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-    
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3=0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
-                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
-                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        // buffer_multidim_lookup
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
+                    }
 
+                    // If we made it this far, the shell derivative we want is contained in the buffer. 
                     size_t thread_id = 0;
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
-
-                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-                    if (ints_shellset == nullptr)
-                        continue;  // nullptr returned if the entire shell-set was screened out
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
-                    // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                        for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                            for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf4;
-                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
+                                    }
                                 }
                             }
                         }
@@ -674,2633 +661,793 @@ py::array f12_double_commutator(double beta) {
             }
         }
     }
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+    // This is not the bottleneck
+    return py::array(result.size(), result.data()); 
+    // This apparently copies data, but it should be fine right?
+    // https://github.com/pybind/pybind11/issues/1042 there's a workaround
 }
 
-// Computes nuclear derivatives of overlap integrals
-py::array overlap_deriv(std::vector<int> deriv_vec) {
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-    // Get order of differentiation
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+// Write OEI derivatives up to `max_deriv_order` to disk
+// HDF5 File Name: oei_derivs.h5 
+//      HDF5 Dataset names within the file (<type> is overlap, kinetic, or potential):
+//      <type>_<nbf1>_<nbf2>_deriv1
+//          shape (nbf1,nbf2,n_unique_1st_derivs)
+//      <type>_<nbf1>_<nbf2>_deriv2
+//          shape (nbf1,nbf2,n_unique_2nd_derivs)
+//      <type>_<nbf1>_<nbf2>_deriv3
+//          shape (nbf1,nbf2,n_unique_3rd_derivs)
+//      ...
+// The number of unique derivatives is essentially equal to the size of the
+// generalized upper triangle of the derivative tensor.
+void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
+    std::cout << "Writing one-electron " << type << " integral derivative tensors up to order " 
+                                         << max_deriv_order << " to disk...";
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
 
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    double check = (nbf1 * nbf2 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
 
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+    // Create H5 File and prepare to fill with 0.0's
+    const H5std_string file_name("oei_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-    // Overlap integral derivative engine
-    std::vector<libint2::Engine> s_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
-    for (size_t i = 1; i != nthreads; ++i) {
-        s_engines[i] = s_engines[0];
-    }
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
+        // how many shell and operator derivatives for potential integrals
+        int nshell_derivs = how_many_derivs(2, deriv_order);
+        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
+        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-    // Get size of overlap derivative array and allocate 
-    size_t length = nbf1 * nbf2;
-    std::vector<double> result(length);
+        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index)
+        // to multidimensional shell derivative index
+        // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+        int d1_buf_idx = (type == "potential") ? 6 + ncart : 6;
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(d1_buf_idx, deriv_order);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-            // If the atoms are the same we ignore it as the derivatives will be zero.
-            if (atom1 == atom2) continue;
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index)
+        // to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
-            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1, atom2};
+        // Define engines and buffers
+        std::vector<libint2::Engine> engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+        
+        if (type == "overlap") {
+            engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+        } else if (type == "kinetic") {
+            engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+        } else if (type == "potential") {
+            engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+            engines[0].set_params(make_point_charges(atoms));
+        } else {
+           throw std::invalid_argument("type must be overlap, kinetic, or potential");
+        }
 
-            // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
-            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-            for (int j = 0; j < desired_atom_indices.size(); j++){
-                int desired_atom_idx = desired_atom_indices[j];
-                // Shell indices
-                for (int i = 0; i < 2; i++){
-                    int atom_idx = shell_atom_index_list[i];
-                    if (atom_idx == desired_atom_idx) { 
-                        int tmp = 3 * i + desired_coordinates[j];
-                        indices[j].push_back(tmp);
-                    }
-                }
-            }
+        for (size_t i = 1; i != nthreads; ++i) {
+            engines[i] = engines[0];
+        }
 
-            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-            // and the total number of subvectors is the order of differentiation
-            // Now we want all combinations where we pick exactly one index from each subvector.
-            // This is achievable through a cartesian product
-            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-            std::vector<int> buffer_indices;
+        // Define HDF5 dataset names
+        const H5std_string dset_name(type + "_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                             + "_deriv" + std::to_string(deriv_order));
 
-            // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-            for (auto vec : index_combos)  {
-                std::sort(vec.begin(), vec.end());
-                int buf_idx = 0;
-                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                buffer_indices.push_back(buf_idx);
-            }
+        // Define rank and dimensions of data that will be written to the file
+        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
+        DataSpace fspace(3, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* dataset = new DataSet(file->createDataSet(dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[3] = {0, 0, 0};
 
-            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once. 
-            size_t thread_id = 0;
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
+#pragma omp parallel for collapse(2) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                std::vector<long> shell_atom_index_list{atom1, atom2};
+
+                size_t thread_id = 0;
 #ifdef _OPENMP
-            thread_id = omp_get_thread_num();
+                thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = s_engines[thread_id].results(); // will point to computed shell sets
-
-            // Loop over every buffer index and accumulate for every shell set.
-            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                auto ints_shellset = buf_vec[buffer_indices[i]];
-                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
-                    }
-                }
-            }
-        }
-    }
-    return py::array(result.size(), result.data()); 
-}
+                engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                const auto& buffer = engines[thread_id].results(); // will point to computed shell sets
 
-// Computes nuclear derivatives of kinetic energy integrals
-py::array kinetic_deriv(std::vector<int> deriv_vec) {
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-    // Get order of differentiation
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+                // Define shell set slabs
+                double shellset_slab [n1][n2][nderivs_triu] = {};
 
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                    // Look up multidimensional cartesian derivative index
+                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                    // Create a vector of vectors called `indices`, where each subvector
+                    // is your possible choices for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is order of differentiation
+                    // What follows fills these indices
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
 
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
-
-    // Kinetic integral derivative engine
-    std::vector<libint2::Engine> t_engines(nthreads);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
-    for (size_t i = 1; i != nthreads; ++i) {
-        t_engines[i] = t_engines[0];
-    }
+                    // Loop over each cartesian coordinate index which we are differentiating wrt
+                    // for this nuclear cartesian derivative index and check to see if it is present
+                    // in the shell duet, and where it is present in the potential operator
+                    for (int j = 0; j < multi_cart_idx.size(); j++){
+                        int desired_atom_idx = multi_cart_idx[j] / 3;
+                        int desired_coord = multi_cart_idx[j] % 3;
+                        // Loop over shell indices
+                        for (int i = 0; i < 2; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coord;
+                                indices[j].push_back(tmp);
+                            }
+                        }
+                        // Now for potentials only, loop over each atom in molecule, and if this derivative
+                        // differentiates wrt that atom, we also need to collect that index.
+                        if (type == "potential") {
+                            for (int i = 0; i < natom; i++){
+                                if (i == desired_atom_idx) {
+                                    int tmp = 3 * (i + 2) + desired_coord;
+                                    indices[j].push_back(tmp);
+                                }
+                            }
+                        }
+                    }
 
-    size_t length = nbf1 * nbf2;
-    std::vector<double> result(length);
+                    // Now indices is a vector of vectors, where each subvector is your choices
+                    // for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                    std::vector<int> buffer_indices;
+                    // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
+                    }
+                    // Loop over shell block for each buffer index which contributes to this derivative
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto shellset = buffer[buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                shellset_slab[f1][f2][nuc_idx] += shellset[idx];
+                            }
+                        }
+                    }
+                } // Unique nuclear cartesian derivative indices loop
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-            // If the atoms are the same we ignore it as the derivatives will be zero.
-            if (atom1 == atom2) continue;
+                /* Serialize HDF dataset writing using OpenMP lock */
+                omp_set_lock(&lock);
 
-            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1, atom2};
+                // Now write this shell set slab to HDF5 file
+                // Create file space hyperslab, defining where to write data to in file
+                hsize_t count[3] = {n1, n2, nderivs_triu};
+                hsize_t start[3] = {bf1, bf2, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                // Create memory dataspace describing the in-memory slab that will be written to the file
+                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+                DataSpace mspace(3, mem_dims);
+                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                // Write buffer data 'shellset_slab' with data type double from
+                // memory dataspace `mspace` to file dataspace `fspace`
+                dataset->write(shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
 
-            // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
-            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-            for (int j = 0; j < desired_atom_indices.size(); j++){
-                int desired_atom_idx = desired_atom_indices[j];
-                // Shell indices
-                for (int i = 0; i < 2; i++){
-                    int atom_idx = shell_atom_index_list[i];
-                    if (atom_idx == desired_atom_idx) {
-                        int tmp = 3 * i + desired_coordinates[j];
-                        indices[j].push_back(tmp);
-                    }
-                }
+                /* Release lock */
+                omp_unset_lock(&lock);
             }
+        } // shell duet loops
+        // Delete datasets for this derivative order
+        delete dataset;
+    } // deriv order loop
 
-            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-            // and the total number of subvectors is the order of differentiation
-            // Now we want all combinations where we pick exactly one index from each subvector.
-            // This is achievable through a cartesian product
-            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-            std::vector<int> buffer_indices;
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // close the file
+    delete file;
+    std::cout << " done" << std::endl;
+} // compute_1e_deriv_disk 
 
-            // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-            for (auto vec : index_combos)  {
-                std::sort(vec.begin(), vec.end());
-                int buf_idx = 0;
-                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                buffer_indices.push_back(buf_idx);
-            }
 
-            // If we made it this far, the shell derivative we want is in the buffer, perhaps even more than once.
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = t_engines[thread_id].results(); // will point to computed shell sets
+// Writes TEI derivatives up to `max_deriv_order` to disk.
+// HDF5 File Name: tei_derivs.h5 
+//      HDF5 Dataset names within the file (<type> is eri, f12, f12g12, f12_squared, or f12_double_commutator):
+//      <type>_<nbf1>_<nbf2>_<nbf3>_<nbf4>_deriv1
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_1st_derivs)
+//      <type>_<nbf1>_<nbf2>_<nbf3>_<nbf4>_deriv2
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_2nd_derivs)
+//      <type>_<nbf1>_<nbf2>_<nbf3>_<nbf4>_deriv3
+//          shape (nbf1,nbf2,nbf3,nbf4,n_unique_3rd_derivs)
+//      ...
+void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) { 
+    std::cout << "Writing two-electron " << type << " integral derivative tensors up to order " 
+                                         << max_deriv_order << " to disk...";
+    const H5std_string file_name("tei_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-            // Loop over every buffer index and accumulate for every shell set.
-            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                auto ints_shellset = buf_vec[buffer_indices[i]];
-                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
-                    }
-                }
-            }
-        }
+    // Check to make sure you are not flooding the disk.
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
     }
-    return py::array(result.size(), result.data()); 
-}
+    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 50 && "Total disk space required for ERI's exceeds 50 GB. Increase threshold and recompile to proceed.");
 
-// Computes nuclear derivatives of potential energy integrals 
-py::array potential_deriv(std::vector<int> deriv_vec) {
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-    // Get order of differentiation
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // Number of unique shell derivatives output by libint (number of indices in buffer)
+        int nshell_derivs = how_many_derivs(4, deriv_order);
+        // Number of unique nuclear derivatives of ERI's
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
+        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index)
+        // to multidimensional shell derivative index
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index)
+        // to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
-    // Potential integral derivative engine
-    std::vector<libint2::Engine> v_engines(nthreads);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
-    v_engines[0].set_params(make_point_charges(atoms));
-    for (size_t i = 1; i != nthreads; ++i) {
-        v_engines[i] = v_engines[0];
-    }
+        // Libint engine for computing shell quartet derivatives
+        std::vector<libint2::Engine> eri_engines(nthreads);
 
-    // Get size of potential derivative array and allocate
-    size_t length = nbf1 * nbf2;
-    std::vector<double> result(length);
+        if (type == "eri") {
+            eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+        } else if (type == "f12") {
+            auto cgtg_params = make_cgtg(beta);
+            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+            eri_engines[0].set_params(cgtg_params);
+        } else if (type == "f12g12") {
+            auto cgtg_params = make_cgtg(beta);
+            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+            eri_engines[0].set_params(cgtg_params);
+        } else if (type == "f12_squared") {
+            auto cgtg_params = take_square(make_cgtg(beta));
+            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+            eri_engines[0].set_params(cgtg_params);
+        } else if (type == "f12_double_commutator") {
+            auto cgtg_params = make_cgtg(beta);
+            eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
+                                                std::numeric_limits<libint2::scalar_type>::epsilon(),
+                                                cgtg_params, libint2::BraKet::xx_xx);
+        } else {
+            throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
+        }
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+        for (size_t i = 1; i != nthreads; ++i) {
+            eri_engines[i] = eri_engines[0];
+        }
 
-            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1, atom2};
+        // Define HDF5 dataset name
+        const H5std_string eri_dset_name(type + "_" + std::to_string(nbf1) + "_" + std::to_string(nbf2)
+                                         + "_" + std::to_string(nbf3) + "_" + std::to_string(nbf4)
+                                         + "_deriv" + std::to_string(deriv_order));
+        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
+        DataSpace fspace(5, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
+        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
 
-            // For every desired atom derivative, check shell and nuclear indices for a match, add it to subvector for that derivative
-            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-            for (int j = 0; j < desired_atom_indices.size(); j++){
-                int desired_atom_idx = desired_atom_indices[j];
-                // Shell indices
-                for (int i = 0; i < 2; i++){
-                    int atom_idx = shell_atom_index_list[i];
-                    if (atom_idx == desired_atom_idx) { 
-                        int tmp = 3 * i + desired_coordinates[j];
-                        indices[j].push_back(tmp);
-                    }
-                }
-                
-                for (int i = 0; i < natom; i++){
-                    // i = shell_atom_index_list[i];
-                    if (i == desired_atom_idx) {
-                        int tmp = 3 * (i + 2) + desired_coordinates[j];
-                        indices[j].push_back(tmp);
-                    }
-                }
-            }
-            
-            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-            // and the total number of subvectors is the order of differentiation
-            // Now we want all combinations where we pick exactly one index from each subvector.
-            // This is achievable through a cartesian product
-            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-            std::vector<int> buffer_indices;
+        /* Initialize lock */
+        omp_init_lock(&lock);
 
-            // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-            for (auto vec : index_combos)  {
-                std::sort(vec.begin(), vec.end());
-                int buf_idx = 0;
-                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                buffer_indices.push_back(buf_idx);
-            }
+#pragma omp parallel for collapse(4) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
+                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
+                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
+                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
+                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
+                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
+                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
+                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
 
-            // Compute the integrals
-            size_t thread_id = 0;
+                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                        size_t thread_id = 0;
 #ifdef _OPENMP
-            thread_id = omp_get_thread_num();
+                        thread_id = omp_get_thread_num();
 #endif
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = v_engines[thread_id].results(); // will point to computed shell sets
+                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
-            // Loop over every buffer index and accumulate for every shell set.
-            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                auto ints_shellset = buf_vec[buffer_indices[i]];
-                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
-                    }
-                }
-            }
-        }
-    }
-    return py::array(result.size(), result.data()); 
-}
+                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                        double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                            // Look up multidimensional cartesian derivative index
+                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+    
+                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+    
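+                            // Find out which of the four shell centers sit on the atom being differentiated, and record that shell's derivative index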
+                            for (int j = 0; j < multi_cart_idx.size(); j++){
+                                int desired_atom_idx = multi_cart_idx[j] / 3;
+                                int desired_coord = multi_cart_idx[j] % 3;
+                                for (int i = 0; i < 4; i++){
+                                    int atom_idx = shell_atom_index_list[i];
+                                    if (atom_idx == desired_atom_idx) {
+                                        int tmp = 3 * i + desired_coord;
+                                        indices[j].push_back(tmp);
+                                    }
+                                }
+                            }
 
-// Computes nuclear derivatives of electron repulsion integrals
-py::array eri_deriv(std::vector<int> deriv_vec) {
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+                            // Now indices is a vector of vectors, where each subvector is your choices
+                            // for the first derivative operator, second, third, etc
+                            // and the total number of subvectors is the order of differentiation
+                            // Now we want all combinations where we pick exactly one index from each subvector.
+                            // This is achievable through a cartesian product 
+                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                            std::vector<int> buffer_indices;
 
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                            for (auto vec : index_combos)  {
+                                std::sort(vec.begin(), vec.end());
+                                int buf_idx = 0;
+                                // buffer_multidim_lookup
+                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                                buffer_indices.push_back(buf_idx);
+                            }
 
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+                            // Loop over shell block, keeping a total count idx for the size of shell set
+                            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                                auto eri_shellset = eri_buffer[buffer_indices[i]];
+                                if (eri_shellset == nullptr) continue;
+                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                    for(auto f2 = 0; f2 != n2; ++f2) {
+                                        for(auto f3 = 0; f3 != n3; ++f3) {
+                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                                eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        } // For every nuc_idx 0, nderivs_triu
 
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+                        /* Serialize HDF dataset writing using OpenMP lock */
+                        omp_set_lock(&lock);
 
-    // ERI derivative integral engine
-    std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
-    for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
+                        // Now write this shell set slab to HDF5 file
+                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                        // Create dataspace describing the in-memory buffer that will be written to the file
+                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                        DataSpace mspace(5, mem_dims);
+                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                        // Write buffer data 'shellset_slab' with data type double from
+                        // memory dataspace `mspace` to file dataspace `fspace`
+                        eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                        /* Release lock */
+                        omp_unset_lock(&lock);
+                    }
+                }
+            }
+        } // shell quartet loops
+        // Close the dataset for this derivative order
+        delete eri_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // Close the file
+    delete file;
+    std::cout << " done" << std::endl;
+} // compute_2e_deriv_disk function
+
+// The following function writes all overlap, kinetic, and potential derivatives up to `max_deriv_order` to disk
+// HDF5 File Name: oei_derivs.h5 
+//      HDF5 Dataset names within the file:
+//      overlap_nbf1_nbf2_deriv1 
+//          shape (nbf,nbf,n_unique_1st_derivs)
+//      overlap_nbf1_nbf2_deriv2 
+//          shape (nbf,nbf,n_unique_2nd_derivs)
+//      overlap_nbf1_nbf2_deriv3 
+//          shape (nbf,nbf,n_unique_3rd_derivs)
+//      ...
+//      kinetic_nbf1_nbf2_deriv1 
+//          shape (nbf,nbf,n_unique_1st_derivs)
+//      kinetic_nbf1_nbf2_deriv2 
+//          shape (nbf,nbf,n_unique_2nd_derivs)
+//      kinetic_nbf1_nbf2_deriv3 
+//          shape (nbf,nbf,n_unique_3rd_derivs)
+//      ...
+//      potential_nbf1_nbf2_deriv1 
+//          shape (nbf,nbf,n_unique_1st_derivs)
+//      potential_nbf1_nbf2_deriv2 
+//          shape (nbf,nbf,n_unique_2nd_derivs)
+//      potential_nbf1_nbf2_deriv3 
+//          shape (nbf,nbf,n_unique_3rd_derivs)
+// The number of unique derivatives is essentially equal to the size of the generalized upper triangle of the derivative tensor.
+void oei_deriv_disk(int max_deriv_order) {
+    std::cout << "Writing one-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
     }
 
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
+    // Create H5 File and prepare to fill with 0.0's
+    const H5std_string file_name("oei_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
+        // how many shell and operator derivatives for potential integrals
+        int nshell_derivs = how_many_derivs(2, deriv_order);
+        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
+        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
+        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+        // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Define engines and buffers
+        std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+        s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+        v_engines[0].set_params(make_point_charges(atoms));
+        for (size_t i = 1; i != nthreads; ++i) {
+            s_engines[i] = s_engines[0];
+            t_engines[i] = t_engines[0];
+            v_engines[i] = v_engines[0];
+        }
+
+        // Define HDF5 dataset names
+        const H5std_string overlap_dset_name("overlap_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                              + "_deriv" + std::to_string(deriv_order));
+        const H5std_string kinetic_dset_name("kinetic_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                              + "_deriv" + std::to_string(deriv_order));
+        const H5std_string potential_dset_name("potential_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                                + "_deriv" + std::to_string(deriv_order));
+
+        // Define rank and dimensions of data that will be written to the file
+        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
+        DataSpace fspace(3, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[3] = {0, 0, 0};
+
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
+#pragma omp parallel for collapse(2) num_threads(nthreads)
+        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
+            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
+                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+                std::vector<long> shell_atom_index_list{atom1, atom2};
+
+                size_t thread_id = 0;
+#ifdef _OPENMP
+                thread_id = omp_get_thread_num();
+#endif
+                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
+
+                // Define shell set slabs
+                double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
+                double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
+                double potential_shellset_slab [n1][n2][nderivs_triu] = {};
+
+                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                    // Look up multidimensional cartesian derivative index
+                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                    // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                    // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                    // What follows fills these indices
                     std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
+                    std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+
+                    // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                    for (int j = 0; j < multi_cart_idx.size(); j++){
+                        int desired_atom_idx = multi_cart_idx[j] / 3;
+                        int desired_coord = multi_cart_idx[j] % 3;
+                        // Loop over shell indices
+                        for (int i = 0; i < 2; i++){
                             int atom_idx = shell_atom_index_list[i];
                             if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
+                                int tmp = 3 * i + desired_coord;
                                 indices[j].push_back(tmp);
+                                potential_indices[j].push_back(tmp);
+                            }
+                        }
+                        // Now for potentials only, loop over each atom in molecule, and if this derivative
+                        // differentiates wrt that atom, we also need to collect that index.
+                        for (int i = 0; i < natom; i++){
+                            if (i == desired_atom_idx) {
+                                int tmp = 3 * (i + 2) + desired_coord;
+                                potential_indices[j].push_back(tmp);
                             }
                         }
                     }
-                    
+
                     // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
                     // and the total number of subvectors is the order of differentiation
                     // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
+                    // This is achievable through a cartesian product
                     std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                    std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
                     std::vector<int> buffer_indices;
-
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    std::vector<int> potential_buffer_indices;
+                    // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
                     for (auto vec : index_combos)  {
                         std::sort(vec.begin(), vec.end());
                         int buf_idx = 0;
-                        // buffer_multidim_lookup
                         auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
                         if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
                         buffer_indices.push_back(buf_idx);
                     }
-
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
-
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // This is not the bottleneck
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes nuclear derivatives of contracted Gaussian-type geminal integrals
-py::array f12_deriv(double beta, std::vector<int> deriv_vec) {
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
-
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
-
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // F12 derivative integral engine
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_engines(nthreads);
-    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-    cgtg_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_engines[i] = cgtg_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
-
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
-                                indices[j].push_back(tmp);
-                            }
-                        }
-                    }
-                    
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
-
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                    for (auto vec : index_combos)  {
+                    // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                    for (auto vec : potential_index_combos)  {
                         std::sort(vec.begin(), vec.end());
                         int buf_idx = 0;
-                        // buffer_multidim_lookup
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
+                        auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                        if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                        potential_buffer_indices.push_back(buf_idx);
                     }
 
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_engines[thread_id].results(); // will point to computed shell sets
-
+                    // Loop over shell block for each buffer index which contributes to this derivative
+                    // Overlap and Kinetic
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;
+                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // This is not the bottleneck
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes nuclear derivatives of squared contracted Gaussian-type geminal integrals
-py::array f12_squared_deriv(double beta, std::vector<int> deriv_vec) {
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
-
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
-
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // F12 Squared derivative integral engine
-    auto cgtg_params = take_square(make_cgtg(beta));
-    std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-    cgtg_squared_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_squared_engines[i] = cgtg_squared_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
-
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
-                                indices[j].push_back(tmp);
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
+                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
                             }
                         }
                     }
-                    
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
-
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        // buffer_multidim_lookup
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
-                    }
-
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
-
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // This is not the bottleneck
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes nuclear derivatives of contracted Gaussian-type geminal times Coulomb replusion integrals
-py::array f12g12_deriv(double beta, std::vector<int> deriv_vec) {
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
-
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
-
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // F12 derivative integral engine
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
-    cgtg_coulomb_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
-
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
-                                indices[j].push_back(tmp);
-                            }
-                        }
-                    }
-                    
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
-
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        // buffer_multidim_lookup
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
-                    }
-
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
-
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // This is not the bottleneck
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// Computes nuclear derivatives of gradient norm of contracted Gaussian-type geminal integrals
-py::array f12_double_commutator_deriv(double beta, std::vector<int> deriv_vec) {
-    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
-
-    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
-    std::vector<int> desired_atom_indices;
-    std::vector<int> desired_coordinates;
-    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
-
-    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // F12 derivative integral engine
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
-    cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_del_engines[i] = cgtg_del_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4;
-    std::vector<double> result(length);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
-
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match, add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
-                                indices[j].push_back(tmp);
-                            }
-                        }
-                    }
-                    
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
-
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        // buffer_multidim_lookup
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
-                    }
-
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
-
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // This is not the bottleneck
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-}
-
-// The following function writes all overlap, kinetic, and potential derivatives up to `max_deriv_order` to disk
-// HDF5 File Name: oei_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      overlap_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      overlap_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      overlap_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-//      ...
-//      kinetic_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      kinetic_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      kinetic_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-//      ...
-//      potential_deriv1 
-//          shape (nbf,nbf,n_unique_1st_derivs)
-//      potential_deriv2 
-//          shape (nbf,nbf,n_unique_2nd_derivs)
-//      potential_deriv3 
-//          shape (nbf,nbf,n_unique_3rd_derivs)
-// The number of unique derivatives is essentially equal to the size of the generalized upper triangle of the derivative tensor.
-void oei_deriv_disk(int max_deriv_order) {
-    std::cout << "Writing one-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-
-    // Create H5 File and prepare to fill with 0.0's
-    const H5std_string file_name("oei_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-        // how many shell and operator derivatives for potential integrals
-        int nshell_derivs = how_many_derivs(2, deriv_order);
-        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
-        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer 
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-        // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-        const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Define engines and buffers
-        std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-        s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
-        t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
-        v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
-        v_engines[0].set_params(make_point_charges(atoms));
-        for (size_t i = 1; i != nthreads; ++i) {
-            s_engines[i] = s_engines[0];
-            t_engines[i] = t_engines[0];
-            v_engines[i] = v_engines[0];
-        }
-
-        // Define HDF5 dataset names
-        const H5std_string overlap_dset_name("overlap_deriv" + std::to_string(deriv_order));
-        const H5std_string kinetic_dset_name("kinetic_deriv" + std::to_string(deriv_order));
-        const H5std_string potential_dset_name("potential_deriv" + std::to_string(deriv_order));
-
-        // Define rank and dimensions of data that will be written to the file
-        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
-        DataSpace fspace(3, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
-        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[3] = {0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                std::vector<long> shell_atom_index_list{atom1, atom2};
-
-                size_t thread_id = 0;
-#ifdef _OPENMP
-                thread_id = omp_get_thread_num();
-#endif
-                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
-
-                // Define shell set slabs
-                double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
-                double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
-                double potential_shellset_slab [n1][n2][nderivs_triu] = {};
-
-                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                    // Look up multidimensional cartesian derivative index
-                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                    // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                    // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                    // What follows fills these indices
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
-
-                    // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                    for (int j = 0; j < multi_cart_idx.size(); j++){
-                        int desired_atom_idx = multi_cart_idx[j] / 3;
-                        int desired_coord = multi_cart_idx[j] % 3;
-                        // Loop over shell indices
-                        for (int i = 0; i < 2; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coord;
-                                indices[j].push_back(tmp);
-                                potential_indices[j].push_back(tmp);
-                            }
-                        }
-                        // Now for potentials only, loop over each atom in molecule, and if this derivative
-                        // differentiates wrt that atom, we also need to collect that index.
-                        for (int i = 0; i < natom; i++){
-                            if (i == desired_atom_idx) {
-                                int tmp = 3 * (i + 2) + desired_coord;
-                                potential_indices[j].push_back(tmp);
-                            }
-                        }
-                    }
-
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
-                    std::vector<int> buffer_indices;
-                    std::vector<int> potential_buffer_indices;
-                    // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
-                    }
-                    // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : potential_index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                        if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                        potential_buffer_indices.push_back(buf_idx);
-                    }
-
-                    // Loop over shell block for each buffer index which contributes to this derivative
-                    // Overlap and Kinetic
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
-                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
-                            }
-                        }
-                    }
-                    // Potential
-                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
-                            }
-                        }
-                    }
-                } // Unique nuclear cartesian derivative indices loop
-
-                /* Serialize HDF dataset writing using OpenMP lock */
-                omp_set_lock(&lock);
-
-                // Now write this shell set slab to HDF5 file
-                // Create file space hyperslab, defining where to write data to in file
-                hsize_t count[3] = {n1, n2, nderivs_triu};
-                hsize_t start[3] = {bf1, bf2, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                // Create dataspace defining for memory dataset to write to file
-                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
-                DataSpace mspace(3, mem_dims);
-                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                /* Release lock */
-                omp_unset_lock(&lock);
-            }
-        } // shell duet loops
-        // Delete datasets for this derivative order
-        delete overlap_dataset;
-        delete kinetic_dataset;
-        delete potential_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} //oei_deriv_disk 
-
-
-// Writes all ERI's up to `max_deriv_order` to disk.
-// HDF5 File Name: eri_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      eri_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      eri_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      eri_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void eri_deriv_disk(int max_deriv_order) { 
-    std::cout << "Writing two-electron integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("eri_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> eri_engines(nthreads);
-        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
-        for (size_t i = 1; i != nthreads; ++i) {
-            eri_engines[i] = eri_engines[0];
-        }
-
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("eri_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        size_t thread_id = 0;
-#ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
-#endif
-                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
-                                    }
-                                }
-                            }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
-                            }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto eri_shellset = eri_buffer[buffer_indices[i]];
-                                if (eri_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
-                    }
-                }
-            }
-        } // shell quartet loops
-        // Close the dataset for this derivative order
-        delete eri_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // Close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} // eri_deriv_disk function
-
-// Writes all F12 ints up to `max_deriv_order` to disk.
-// HDF5 File Name: f12_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      f12_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      f12_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      f12_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void f12_deriv_disk(double beta, int max_deriv_order) { 
-    std::cout << "Writing two-electron F12 integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("f12_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    auto cgtg_params = make_cgtg(beta);
-    
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> cgtg_engines(nthreads);
-        cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-        cgtg_engines[0].set_params(cgtg_params);
-        for (size_t i = 1; i != nthreads; ++i) {
-            cgtg_engines[i] = cgtg_engines[0];
-        }
-
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("f12_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* f12_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        size_t thread_id = 0;
-#ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
-#endif
-                        cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& f12_buffer = cgtg_engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double f12_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
-                                    }
-                                }
-                            }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-                            
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
-                            }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto f12_shellset = f12_buffer[buffer_indices[i]];
-                                if (f12_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                f12_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_shellset[idx];
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        f12_dataset->write(f12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
-                    }
-                }
-            }
-        } // shell quartet loops
-        // Close the dataset for this derivative order
-        delete f12_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // Close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} // f12_deriv_disk function
-
-// Writes all F12 Squared ints up to `max_deriv_order` to disk.
-// HDF5 File Name: f12_squared_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      f12_squared_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      f12_squared_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      f12_squared_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void f12_squared_deriv_disk(double beta, int max_deriv_order) { 
-    std::cout << "Writing two-electron F12 squared integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("f12_squared_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    auto cgtg_params = take_square(make_cgtg(beta));
-    
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-        cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-        cgtg_squared_engines[0].set_params(cgtg_params);
-        for (size_t i = 1; i != nthreads; ++i) {
-            cgtg_squared_engines[i] = cgtg_squared_engines[0];
-        }
-
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("f12_squared_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* f12_squared_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        size_t thread_id = 0;
-#ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
-#endif
-                        cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& f12_squared_buffer = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double f12_squared_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
-                                    }
-                                }
-                            }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-                            
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
-                            }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto f12_squared_shellset = f12_squared_buffer[buffer_indices[i]];
-                                if (f12_squared_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                f12_squared_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_squared_shellset[idx];
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        f12_squared_dataset->write(f12_squared_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
-                    }
-                }
-            }
-        } // shell quartet loops
-        // Close the dataset for this derivative order
-        delete f12_squared_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // Close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} // f12_squared_deriv_disk function
-
-// Writes all F12G12 ints up to `max_deriv_order` to disk.
-// HDF5 File Name: f12g12_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      f12g12_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      f12g12_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      f12g12_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void f12g12_deriv_disk(double beta, int max_deriv_order) { 
-    std::cout << "Writing two-electron F12G12 integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("f12g12_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    auto cgtg_params = make_cgtg(beta);
-    
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-        cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
-        cgtg_coulomb_engines[0].set_params(cgtg_params);
-        for (size_t i = 1; i != nthreads; ++i) {
-            cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
-        }
-
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("f12g12_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* f12g12_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        size_t thread_id = 0;
-#ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
-#endif
-                        cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& f12g12_buffer = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double f12g12_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
-                                    }
-                                }
-                            }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-                            
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
-                            }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto f12g12_shellset = f12g12_buffer[buffer_indices[i]];
-                                if (f12g12_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                f12g12_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12g12_shellset[idx];
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        f12g12_dataset->write(f12g12_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
-                    }
-                }
-            }
-        } // shell quartet loops
-        // Close the dataset for this derivative order
-        delete f12g12_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // Close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} // f12g12_deriv_disk function
-
-// Writes all F12 Double Commutator ints up to `max_deriv_order` to disk.
-// HDF5 File Name: f12_derivs.h5 
-//      HDF5 Dataset names within the file:
-//      f12_double_commutator_deriv1 
-//          shape (nbf,nbf,nbf,nbf,n_unique_1st_derivs)
-//      f12_double_commutator_deriv2
-//          shape (nbf,nbf,nbf,nbf,n_unique_2nd_derivs)
-//      f12_double_commutator_deriv3
-//          shape (nbf,nbf,nbf,nbf,n_unique_3rd_derivs)
-//      ...
-void f12_double_commutator_deriv_disk(double beta, int max_deriv_order) { 
-    std::cout << "Writing two-electron F12 Double Commutator integral derivative tensors up to order " << max_deriv_order << " to disk...";
-    const H5std_string file_name("f12_double_commutator_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
-    // Check to make sure you are not flooding the disk.
-    long total_deriv_slices = 0;
-    for (int i = 1; i <= max_deriv_order; i++){
-        total_deriv_slices += how_many_derivs(natom, i);
-    }
-    double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
-    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
-
-    auto cgtg_params = make_cgtg(beta);
-    
-    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
-        // Number of unique nuclear derivatives of ERI's
-        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-        // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-        // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-        // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-        // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
-        cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
-        for (size_t i = 1; i != nthreads; ++i) {
-            cgtg_del_engines[i] = cgtg_del_engines[0];
-        }
-
-        // Define HDF5 dataset name
-        const H5std_string eri_dset_name("f12_double_commutator_deriv" + std::to_string(deriv_order));
-        hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
-        DataSpace fspace(5, file_dims);
-        // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* f12_double_commutator_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
-        hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-        hsize_t zerostart[5] = {0, 0, 0, 0, 0};
-
-        /* Initialize lock */
-        omp_init_lock(&lock);
-
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        size_t thread_id = 0;
-#ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
-#endif
-                        cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& f12_double_commutator_buffer = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double f12_double_commutator_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
-                                    }
-                                }
-                            }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-                            
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
-                            }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto f12_double_commutator_shellset = f12_double_commutator_buffer[buffer_indices[i]];
-                                if (f12_double_commutator_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                f12_double_commutator_shellset_slab[f1][f2][f3][f4][nuc_idx] += f12_double_commutator_shellset[idx];
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                        f12_double_commutator_dataset->write(f12_double_commutator_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
-                    }
-                }
-            }
-        } // shell quartet loops
-        // Close the dataset for this derivative order
-        delete f12_double_commutator_dataset;
-    } // deriv order loop
-
-    /* Finished lock mechanism, destroy it */
-    omp_destroy_lock(&lock);
-    // Close the file
-    delete file;
-    std::cout << " done" << std::endl;
-} // f12_double_commutator_deriv_disk function
-
-// Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
-std::vector<py::array> oei_deriv_core(int deriv_order) {
-    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-    // how many shell and operator derivatives for potential integrals
-    int nshell_derivs = how_many_derivs(2, deriv_order);
-    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
-    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
-    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
-    const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Define engines and buffers
-    std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
-    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
-    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
-    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
-    v_engines[0].set_params(make_point_charges(atoms));
-    for (size_t i = 1; i != nthreads; ++i) {
-        s_engines[i] = s_engines[0];
-        t_engines[i] = t_engines[0];
-        v_engines[i] = v_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nderivs_triu;
-    std::vector<double> S(length);
-    std::vector<double> T(length);
-    std::vector<double> V(length);
-
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-            std::vector<long> shell_atom_index_list{atom1, atom2};
-
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
-
-            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
-
-                // Look up multidimensional cartesian derivative index
-                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                // What follows fills these indices
-                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
-
-                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                for (int j = 0; j < multi_cart_idx.size(); j++){
-                    int desired_atom_idx = multi_cart_idx[j] / 3;
-                    int desired_coord = multi_cart_idx[j] % 3;
-                    // Loop over shell indices
-                    for (int i = 0; i < 2; i++){
-                        int atom_idx = shell_atom_index_list[i];
-                        if (atom_idx == desired_atom_idx) {
-                            int tmp = 3 * i + desired_coord;
-                            indices[j].push_back(tmp);
-                            potential_indices[j].push_back(tmp);
-                        }
-                    }
-                    // Now for potentials only, loop over each atom in molecule, and if this derivative
-                    // differentiates wrt that atom, we also need to collect that index.
-                    for (int i = 0; i < natom; i++){
-                        if (i == desired_atom_idx) {
-                            int tmp = 3 * (i + 2) + desired_coord;
-                            potential_indices[j].push_back(tmp);
-                        }
-                    }
-                }
-
-                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                // and the total number of subvectors is the order of differentiation
-                // Now we want all combinations where we pick exactly one index from each subvector.
-                // This is achievable through a cartesian product
-                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
-                std::vector<int> buffer_indices;
-                std::vector<int> potential_buffer_indices;
-                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                    buffer_indices.push_back(buf_idx);
-                }
-                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : potential_index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                    potential_buffer_indices.push_back(buf_idx);
-                }
-
-                // Loop over shell block for each buffer index which contributes to this derivative
-                // Overlap and Kinetic
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
-                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
-                        }
-                    }
-                }
-                // Potential
-                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
-                        }
-                    }
-                }
-            } // Unique nuclear cartesian derivative indices loop
-        }
-    } // shell duet loops
-    return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // oei_deriv_core function
-
-// Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
-py::array eri_deriv_core(int deriv_order) {
-    // Number of unique shell derivatives output by libint (number of indices in buffer)
-    int nshell_derivs = how_many_derivs(4, deriv_order);
-    // Number of unique nuclear derivatives of ERI's
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Libint engine for computing shell quartet derivatives
-    std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
-    for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
-    std::vector<double> result(length);
-
-    // Begin shell quartet loops
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
-
-                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
-
-                        // Look up multidimensional cartesian derivative index
-                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
-                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j = 0; j < multi_cart_idx.size(); j++){
-                            int desired_atom_idx = multi_cart_idx[j] / 3;
-                            int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i < 4; i++){
-                                int atom_idx = shell_atom_index_list[i];
-                                if (atom_idx == desired_atom_idx) {
-                                    int tmp = 3 * i + desired_coord;
-                                    indices[j].push_back(tmp);
-                                }
-                            }
-                        }
-
-                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                        // and the total number of subvectors is the order of differentiation
-                        // Now we want all combinations where we pick exactly one index from each subvector.
-                        // This is achievable through a cartesian product 
-                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                        std::vector<int> buffer_indices;
-                        
-                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        for (auto vec : index_combos)  {
-                            std::sort(vec.begin(), vec.end());
-                            int buf_idx = 0;
-                            // buffer_multidim_lookup
-                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            buffer_indices.push_back(buf_idx);
-                        }
-
-                        // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto eri_shellset = eri_buffer[buffer_indices[i]];
-                            if (eri_shellset == nullptr) continue;
-                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                                for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                    for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf4;
-                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += eri_shellset[idx];
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    } // For every nuc_idx 0, nderivs_triu
-                }
-            }
-        }
-    } // shell quartet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // eri_deriv_core function
-
-// Computes a single 'deriv_order' derivative tensor of contracted Gaussian-type geminal integrals, keeps everything in core memory
-py::array f12_deriv_core(double beta, int deriv_order) {
-    // Number of unique shell derivatives output by libint (number of indices in buffer)
-    int nshell_derivs = how_many_derivs(4, deriv_order);
-    // Number of unique nuclear derivatives of ERI's
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Libint engine for computing shell quartet derivatives
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_engines(nthreads);
-    cgtg_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-    cgtg_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_engines[i] = cgtg_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
-    std::vector<double> result(length);
-
-    // Begin shell quartet loops
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& f12_buffer = cgtg_engines[thread_id].results(); // will point to computed shell sets
-
-                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
-
-                        // Look up multidimensional cartesian derivative index
-                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
-                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j = 0; j < multi_cart_idx.size(); j++){
-                            int desired_atom_idx = multi_cart_idx[j] / 3;
-                            int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i < 4; i++){
-                                int atom_idx = shell_atom_index_list[i];
-                                if (atom_idx == desired_atom_idx) {
-                                    int tmp = 3 * i + desired_coord;
-                                    indices[j].push_back(tmp);
-                                }
-                            }
-                        }
-
-                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                        // and the total number of subvectors is the order of differentiation
-                        // Now we want all combinations where we pick exactly one index from each subvector.
-                        // This is achievable through a cartesian product 
-                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                        std::vector<int> buffer_indices;
-                        
-                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        for (auto vec : index_combos)  {
-                            std::sort(vec.begin(), vec.end());
-                            int buf_idx = 0;
-                            // buffer_multidim_lookup
-                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            buffer_indices.push_back(buf_idx);
-                        }
-
-                        // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto f12_shellset = f12_buffer[buffer_indices[i]];
-                            if (f12_shellset == nullptr) continue;
-                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                                for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                    for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf4;
-                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_shellset[idx];
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    } // For every nuc_idx 0, nderivs_triu
-                }
-            }
-        }
-    } // shell quartet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // f12_deriv_core function
-
-// Computes a single 'deriv_order' derivative tensor of squared contracted Gaussian-type geminal integrals, keeps everything in core memory
-py::array f12_squared_deriv_core(double beta, int deriv_order) {
-    // Number of unique shell derivatives output by libint (number of indices in buffer)
-    int nshell_derivs = how_many_derivs(4, deriv_order);
-    // Number of unique nuclear derivatives of ERI's
-    unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
-
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
-
-    // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
-    const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
-
-    // Libint engine for computing shell quartet derivatives
-    auto cgtg_params = take_square(make_cgtg(beta));
-    std::vector<libint2::Engine> cgtg_squared_engines(nthreads);
-    cgtg_squared_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-    cgtg_squared_engines[0].set_params(cgtg_params);
-    for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_squared_engines[i] = cgtg_squared_engines[0];
-    }
-
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
-    std::vector<double> result(length);
-
-    // Begin shell quartet loops
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    size_t thread_id = 0;
-#ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
-#endif
-                    cgtg_squared_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& f12_squared_buffer = cgtg_squared_engines[thread_id].results(); // will point to computed shell sets
-
-                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
-
-                        // Look up multidimensional cartesian derivative index
-                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
-                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j = 0; j < multi_cart_idx.size(); j++){
-                            int desired_atom_idx = multi_cart_idx[j] / 3;
-                            int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i < 4; i++){
-                                int atom_idx = shell_atom_index_list[i];
-                                if (atom_idx == desired_atom_idx) {
-                                    int tmp = 3 * i + desired_coord;
-                                    indices[j].push_back(tmp);
-                                }
-                            }
-                        }
-
-                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                        // and the total number of subvectors is the order of differentiation
-                        // Now we want all combinations where we pick exactly one index from each subvector.
-                        // This is achievable through a cartesian product 
-                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                        std::vector<int> buffer_indices;
-                        
-                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        for (auto vec : index_combos)  {
-                            std::sort(vec.begin(), vec.end());
-                            int buf_idx = 0;
-                            // buffer_multidim_lookup
-                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            buffer_indices.push_back(buf_idx);
-                        }
-
-                        // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto f12_squared_shellset = f12_squared_buffer[buffer_indices[i]];
-                            if (f12_squared_shellset == nullptr) continue;
-                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                                for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                    for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf4;
-                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_squared_shellset[idx];
-                                        }
-                                    }
-                                }
+                    // Potential
+                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
                             }
                         }
-                    } // For every nuc_idx 0, nderivs_triu
-                }
+                    }
+                } // Unique nuclear cartesian derivative indices loop
+
+                /* Serialize HDF dataset writing using OpenMP lock */
+                omp_set_lock(&lock);
+
+                // Now write this shell set slab to HDF5 file
+                // Create file space hyperslab, defining where to write data to in file
+                hsize_t count[3] = {n1, n2, nderivs_triu};
+                hsize_t start[3] = {bf1, bf2, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                // Create dataspace defining for memory dataset to write to file
+                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+                DataSpace mspace(3, mem_dims);
+                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                /* Release lock */
+                omp_unset_lock(&lock);
             }
-        }
-    } // shell quartet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // f12_squared_deriv_core function
+        } // shell duet loops
+        // Delete datasets for this derivative order
+        delete overlap_dataset;
+        delete kinetic_dataset;
+        delete potential_dataset;
+    } // deriv order loop
 
-// Computes a single 'deriv_order' derivative tensor of contracted Gaussian-type geminal times Coulomb replusion integrals, keeps everything in core memory
-py::array f12g12_deriv_core(double beta, int deriv_order) {
-    // Number of unique shell derivatives output by libint (number of indices in buffer)
-    int nshell_derivs = how_many_derivs(4, deriv_order);
-    // Number of unique nuclear derivatives of ERI's
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // close the file
+    delete file;
+    std::cout << " done" << std::endl;
+} //oei_deriv_disk 
+
+// Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
+std::vector<py::array> oei_deriv_core(int deriv_order) {
+    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
+    // how many shell and operator derivatives for potential integrals
+    int nshell_derivs = how_many_derivs(2, deriv_order);
+    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
+    // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
-    // Create mapping from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
-    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+    // Overlap and kinetic have different mappings than potential since potential has more elements in the buffer
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+    // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
+    const std::vector<std::vector<int>> potential_buffer_multidim_lookup = generate_multi_index_lookup(6 + ncart, deriv_order);
 
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
-    // Libint engine for computing shell quartet derivatives
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_coulomb_engines(nthreads);
-    cgtg_coulomb_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
-    cgtg_coulomb_engines[0].set_params(cgtg_params);
+    // Define engines and buffers
+    std::vector<libint2::Engine> s_engines(nthreads), t_engines(nthreads), v_engines(nthreads);
+    s_engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l, deriv_order);
+    t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
+    v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
+    v_engines[0].set_params(make_point_charges(atoms));
     for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_coulomb_engines[i] = cgtg_coulomb_engines[0];
+        s_engines[i] = s_engines[0];
+        t_engines[i] = t_engines[0];
+        v_engines[i] = v_engines[0];
     }
 
-    size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
-    std::vector<double> result(length);
+    size_t length = nbf1 * nbf2 * nderivs_triu;
+    std::vector<double> S(length);
+    std::vector<double> T(length);
+    std::vector<double> V(length);
 
-    // Begin shell quartet loops
-#pragma omp parallel for collapse(4) num_threads(nthreads)
+#pragma omp parallel for collapse(2) num_threads(nthreads)
     for(auto s1 = 0; s1 != bs1.size(); ++s1) {
         for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
+            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
+            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
+            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
+            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
+            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
-                    size_t thread_id = 0;
+            size_t thread_id = 0;
 #ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                    cgtg_coulomb_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& f12g12_buffer = cgtg_coulomb_engines[thread_id].results(); // will point to computed shell sets
+            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
+            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
 
-                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
 
-                        // Look up multidimensional cartesian derivative index
-                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
-                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j = 0; j < multi_cart_idx.size(); j++){
-                            int desired_atom_idx = multi_cart_idx[j] / 3;
-                            int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i < 4; i++){
-                                int atom_idx = shell_atom_index_list[i];
-                                if (atom_idx == desired_atom_idx) {
-                                    int tmp = 3 * i + desired_coord;
-                                    indices[j].push_back(tmp);
-                                }
-                            }
-                        }
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
 
-                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                        // and the total number of subvectors is the order of differentiation
-                        // Now we want all combinations where we pick exactly one index from each subvector.
-                        // This is achievable through a cartesian product 
-                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                        std::vector<int> buffer_indices;
-                        
-                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        for (auto vec : index_combos)  {
-                            std::sort(vec.begin(), vec.end());
-                            int buf_idx = 0;
-                            // buffer_multidim_lookup
-                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            buffer_indices.push_back(buf_idx);
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                            potential_indices[j].push_back(tmp);
+                        }
+                    }
+                    // Now for potentials only, loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index.
+                    for (int i = 0; i < natom; i++){
+                        if (i == desired_atom_idx) {
+                            int tmp = 3 * (i + 2) + desired_coord;
+                            potential_indices[j].push_back(tmp);
                         }
+                    }
+                }
 
-                        // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto f12g12_shellset = f12g12_buffer[buffer_indices[i]];
-                            if (f12g12_shellset == nullptr) continue;
-                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                                for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                    for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf4;
-                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12g12_shellset[idx];
-                                        }
-                                    }
-                                }
-                            }
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+                std::vector<int> buffer_indices;
+                std::vector<int> potential_buffer_indices;
+                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
+                }
+                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : potential_index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                    potential_buffer_indices.push_back(buf_idx);
+                }
+
+                // Loop over shell block for each buffer index which contributes to this derivative
+                // Overlap and Kinetic
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
                         }
-                    } // For every nuc_idx 0, nderivs_triu
+                    }
                 }
-            }
+                // Potential
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
         }
-    } // shell quartet loops
-    return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // f12g12_deriv_core function
+    } // shell duet loops
+    return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
+} // oei_deriv_core function
 
-// Computes a single 'deriv_order' derivative tensor of gradient norm of contracted Gaussian-type geminal integrals, keeps everything in core memory
-py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
+// Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
+py::array eri_deriv_core(int deriv_order) {
     // Number of unique shell derivatives output by libint (number of indices in buffer)
     int nshell_derivs = how_many_derivs(4, deriv_order);
     // Number of unique nuclear derivatives of ERI's
@@ -3313,12 +1460,10 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
-    auto cgtg_params = make_cgtg(beta);
-    std::vector<libint2::Engine> cgtg_del_engines(nthreads);
-    // Returns Runtime Error: bad any_cast if shorthand version is used, may be an error on the Libint side since Psi4 works with this as well
-    cgtg_del_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order, 0., cgtg_params, libint2::BraKet::xx_xx);
+    std::vector<libint2::Engine> eri_engines(nthreads);
+    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     for (size_t i = 1; i != nthreads; ++i) {
-        cgtg_del_engines[i] = cgtg_del_engines[0];
+        eri_engines[i] = eri_engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
@@ -3350,8 +1495,8 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    cgtg_del_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& f12_double_commutator_buffer = cgtg_del_engines[thread_id].results(); // will point to computed shell sets
+                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                     for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
@@ -3393,8 +1538,8 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
 
                         // Loop over shell block, keeping a total count idx for the size of shell set
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto f12_double_commutator_shellset = f12_double_commutator_buffer[buffer_indices[i]];
-                            if (f12_double_commutator_shellset == nullptr) continue;
+                            auto eri_shellset = eri_buffer[buffer_indices[i]];
+                            if (eri_shellset == nullptr) continue;
                             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                                 size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
                                 for(auto f2 = 0; f2 != n2; ++f2) {
@@ -3402,7 +1547,7 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         size_t offset_3 = (bf3 + f3) * nbf4;
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += f12_double_commutator_shellset[idx];
+                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += eri_shellset[idx];
                                         }
                                     }
                                 }
@@ -3414,7 +1559,7 @@ py::array f12_double_commutator_deriv_core(double beta, int deriv_order) {
         }
     } // shell quartet loops
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
-} // f12_double_commutator_deriv_core function
+} // eri_deriv_core function
 
 // Define module named 'libint_interface' which can be imported with python
 // The second arg, 'm' defines a variable py::module_ which can be used to create
@@ -3423,36 +1568,18 @@ PYBIND11_MODULE(libint_interface, m) {
     m.doc() = "pybind11 libint interface to molecular integrals"; // optional module docstring
     m.def("initialize", &initialize, "Initializes libint, builds geom and basis, assigns globals");
     m.def("finalize", &finalize, "Kills libint");
-    m.def("overlap", &overlap, "Computes overlap integrals with libint");
-    m.def("kinetic", &kinetic, "Computes kinetic integrals with libint");
-    m.def("potential", &potential, "Computes potential integrals with libint");
-    m.def("eri", &eri, "Computes electron repulsion integrals with libint");
-    m.def("f12", &f12, "Computes contracted Gaussian-type geminal integrals with libint");
-    m.def("f12_squared", &f12_squared, "Computes sqaured contracted Gaussian-type geminal integrals with libint");
-    m.def("f12g12", &f12g12, "Computes contracted Gaussian-type geminal times Coulomb repulsion integrals with libint");
-    m.def("f12_double_commutator", &f12_double_commutator, "Computes gradient norm of contracted Gaussian-type geminal integrals with libint");
-    m.def("overlap_deriv", &overlap_deriv, "Computes overlap integral nuclear derivatives with libint");
-    m.def("kinetic_deriv", &kinetic_deriv, "Computes kinetic integral nuclear derivatives with libint");
-    m.def("potential_deriv", &potential_deriv, "Computes potential integral nuclear derivatives with libint");
-    m.def("eri_deriv", &eri_deriv, "Computes electron repulsion integral nuclear derivatives with libint");
-    m.def("f12_deriv", &f12_deriv, "Computes contracted Gaussian-type geminal integral nuclear derivatives with libint");
-    m.def("f12_squared_deriv", &f12_squared_deriv, "Computes sqaured contracted Gaussian-type geminal integral nuclear derivatives with libint");
-    m.def("f12g12_deriv", &f12g12_deriv, "Computes contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivatives with libint");
-    m.def("f12_double_commutator_deriv", &f12_double_commutator_deriv, "Computes gradient norm of contracted Gaussian-type geminal integral nuclear derivatives with libint");
+    m.def("compute_1e_int", &compute_1e_int, "Computes one-electron integrals with libint");
+    m.def("compute_2e_int", &compute_2e_int, "Computes two-electron integrals with libint");
+    m.def("compute_1e_deriv", &compute_1e_deriv, "Computes one-electron integral nuclear derivatives with libint");
+    m.def("compute_2e_deriv", &compute_2e_deriv, "Computes two-electron integral nuclear derivatives with libint");
+    m.def("compute_1e_deriv_disk", &compute_1e_deriv_disk, "Computes one-electron nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("compute_2e_deriv_disk", &compute_2e_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_disk", &oei_deriv_disk, "Computes overlap, kinetic, and potential integral derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("eri_deriv_disk", &eri_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("f12_deriv_disk", &f12_deriv_disk, "Computes contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("f12_squared_deriv_disk", &f12_squared_deriv_disk, "Computes sqaured contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("f12g12_deriv_disk", &f12g12_deriv_disk, "Computes contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
-    m.def("f12_double_commutator_deriv_disk", &f12_double_commutator_deriv_disk, "Computes gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_core", &oei_deriv_core, "Computes a single OEI integral derivative tensor, in memory.");
     m.def("eri_deriv_core", &eri_deriv_core, "Computes a single coulomb integral nuclear derivative tensor, in memory.");
-    //m.def("f12_partial_deriv_core", &f12_deriv_core, "Computes a single contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
-    //m.def("f12_squared_partial_deriv_core", &f12_squared_deriv_core, "Computes a single sqaured contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
-    //m.def("f12g12_partial_deriv_core", &f12g12_deriv_core, "Computes a single contracted Gaussian-type geminal times Coulomb repulsion integral nuclear derivative tensor, in memory.");
-    //m.def("f12_double_commutator_partial_deriv_core", &f12_double_commutator_deriv_core, "Computes a single gradient norm of contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
     //TODO partial derivative impl's
-    //m.def("eri_partial_deriv_disk", &eri_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
+    //m.def("compute_2e_deriv_core", &compute_2e_partial_deriv_core, "Computes a single contracted Gaussian-type geminal integral nuclear derivative tensor, in memory.");
+    //m.def("compute_2e_partial_deriv_disk", &compute_2e_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
      m.attr("LIBINT2_MAX_DERIV_ORDER") = LIBINT2_MAX_DERIV_ORDER;
 }
 
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 1581abd..608f275 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -90,17 +90,17 @@ def potential_deriv(self, geom, deriv_vec):
 
     # Create primitive evaluation rules
     def overlap_impl(self, geom):
-        S = libint_interface.overlap()
+        S = libint_interface.compute_1e_int("overlap")
         S = S.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(S)
 
     def kinetic_impl(self, geom):
-        T = libint_interface.kinetic()
+        T = libint_interface.compute_1e_int("kinetic")
         T = T.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(T)
 
     def potential_impl(self, geom):
-        V = libint_interface.potential()
+        V = libint_interface.compute_1e_int("potential")
         V = V.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(V)
 
@@ -113,15 +113,17 @@ def overlap_deriv_impl(self, geom, deriv_vec):
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(S)
         if self.mode == 'f12':
-            S = libint_interface.overlap_deriv(deriv_vec)
+            S = libint_interface.compute_1e_deriv("overlap", deriv_vec)
             return jnp.asarray(S).reshape(self.nbf1,self.nbf2)
         elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
-                dataset_name = "overlap_deriv" + str(deriv_order)
+                dataset_name = "overlap_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
             elif os.path.exists("oei_partials.h5"):
                 file_name = "oei_partials.h5"
-                dataset_name = "overlap_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset_name = "overlap_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("Something went wrong reading integral derivative file")
             with h5py.File(file_name, 'r') as f:
@@ -143,15 +145,17 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(T)
         if self.mode == 'f12':
-            T = libint_interface.kinetic_deriv(deriv_vec)
+            T = libint_interface.compute_1e_deriv("kinetic", deriv_vec)
             return jnp.asarray(T).reshape(self.nbf1,self.nbf2)
         elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
-                dataset_name = "kinetic_deriv" + str(deriv_order)
+                dataset_name = "kinetic_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
             elif os.path.exists("oei_partials.h5"):
                 file_name = "oei_partials.h5"
-                dataset_name = "kinetic_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset_name = "kinetic_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("Something went wrong reading integral derivative file")
             with h5py.File(file_name, 'r') as f:
@@ -173,15 +177,17 @@ def potential_deriv_impl(self, geom, deriv_vec):
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
             return jnp.asarray(V)
         if self.mode == 'f12':
-            V = libint_interface.potential_deriv(deriv_vec)
+            V = libint_interface.compute_1e_deriv("potential", deriv_vec)
             return jnp.asarray(V).reshape(self.nbf1,self.nbf2)
         elif self.mode == 'disk':
             if os.path.exists("oei_derivs.h5"):
                 file_name = "oei_derivs.h5"
-                dataset_name = "potential_deriv" + str(deriv_order)
+                dataset_name = "potential_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
             elif os.path.exists("oei_partials.h5"):
                 file_name = "oei_partials.h5"
-                dataset_name = "potential_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset_name = "potential_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("Something went wrong reading integral derivative file")
             with h5py.File(file_name, 'r') as f:
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index c29852c..b22a192 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -115,27 +115,27 @@ def f12_double_commutator_deriv(self, geom, beta, deriv_vec):
 
     # Create primitive evaluation rules
     def eri_impl(self, geom):
-        G = libint_interface.eri()
+        G = libint_interface.compute_2e_int("eri", 0.)
         G = G.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(G)
 
     def f12_impl(self, geom, beta):
-        F = libint_interface.f12(beta)
+        F = libint_interface.compute_2e_int("f12", beta)
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12_squared_impl(self, geom, beta):
-        F = libint_interface.f12_squared(beta)
+        F = libint_interface.compute_2e_int("f12_squared", beta)
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
     def f12g12_impl(self, geom, beta):
-        F = libint_interface.f12g12(beta)
+        F = libint_interface.compute_2e_int("f12g12", beta)
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
     
     def f12_double_commutator_impl(self, geom, beta):
-        F = libint_interface.f12_double_commutator(beta)
+        F = libint_interface.compute_2e_int("f12_double_commutator", beta)
         F = F.reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
         return jnp.asarray(F)
 
@@ -150,19 +150,23 @@ def eri_deriv_impl(self, geom, deriv_vec):
             return jnp.asarray(G)
 
         if self.mode == 'f12':
-            G = libint_interface.eri_deriv(deriv_vec)
+            G = libint_interface.compute_2e_deriv("eri", 0., deriv_vec)
             return jnp.asarray(G).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("eri_derivs.h5"):
-                file_name = "eri_derivs.h5"
-                dataset_name = "eri_deriv" + str(deriv_order)
+            if os.path.exists("tei_derivs.h5"):
+                file_name = "tei_derivs.h5"
+                dataset_name = "eri_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("eri_partials.h5"):
-                file_name = "eri_partials.h5"
-                dataset_name = "eri_deriv" + str(deriv_order) + "_" + str(idx)
+            elif os.path.exists("tei_partials.h5"):
+                file_name = "tei_partials.h5"
+                dataset_name = "eri_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("ERI derivatives not found on disk")
 
@@ -183,19 +187,23 @@ def f12_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 derivatives in memory
         if self.mode == 'f12':
-            F = libint_interface.f12_deriv(beta, deriv_vec)
+            F = libint_interface.compute_2e_deriv("f12", beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("f12_derivs.h5"):
-                file_name = "f12_derivs.h5"
-                dataset_name = "f12_deriv" + str(deriv_order)
+            if os.path.exists("tei_derivs.h5"):
+                file_name = "tei_derivs.h5"
+                dataset_name = "f12_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("f12_partials.h5"):
-                file_name = "f12_partials.h5"
-                dataset_name = "f12_deriv" + str(deriv_order) + "_" + str(idx)
+            elif os.path.exists("tei_partials.h5"):
+                file_name = "tei_partials.h5"
+                dataset_name = "f12_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("F12 derivatives not found on disk")
 
@@ -216,19 +224,23 @@ def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 squared derivatives in memory
         if self.mode == 'f12':
-            F = libint_interface.f12_squared_deriv(beta, deriv_vec)
+            F = libint_interface.compute_2e_deriv("f12_squared", beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("f12_squared_derivs.h5"):
-                file_name = "f12_squared_derivs.h5"
-                dataset_name = "f12_squared_deriv" + str(deriv_order)
+            if os.path.exists("tei_derivs.h5"):
+                file_name = "tei_derivs.h5"
+                dataset_name = "f12_squared_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("f12_squared_partials.h5"):
-                file_name = "f12_squared_partials.h5"
-                dataset_name = "f12_squared_deriv" + str(deriv_order) + "_" + str(idx)
+            elif os.path.exists("tei_partials.h5"):
+                file_name = "tei_partials.h5"
+                dataset_name = "f12_squared_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("F12 Squared derivatives not found on disk")
 
@@ -249,19 +261,23 @@ def f12g12_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12g12 derivatives in memory
         if self.mode == 'f12':
-            F = libint_interface.f12g12_deriv(beta, deriv_vec)
+            F = libint_interface.compute_2e_deriv("f12g12", beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("f12g12_derivs.h5"):
-                file_name = "f12g12_derivs.h5"
-                dataset_name = "f12g12_deriv" + str(deriv_order)
+            if os.path.exists("tei_derivs.h5"):
+                file_name = "tei_derivs.h5"
+                dataset_name = "f12g12_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("f12g12_partials.h5"):
-                file_name = "f12g12_partials.h5"
-                dataset_name = "f12g12_deriv" + str(deriv_order) + "_" + str(idx)
+            elif os.path.exists("tei_partials.h5"):
+                file_name = "tei_partials.h5"
+                dataset_name = "f12g12_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("F12G12 derivatives not found on disk")
 
@@ -282,19 +298,23 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
 
         # Use f12 double commutator derivatives in memory
         if self.mode == 'f12':
-            F = libint_interface.f12_double_commutator_deriv(beta, deriv_vec)
+            F = libint_interface.compute_2e_deriv("f12_double_commutator", beta, deriv_vec)
             return jnp.asarray(F).reshape(self.nbf1, self.nbf2, self.nbf3, self.nbf4)
 
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("f12_double_commutator_derivs.h5"):
-                file_name = "f12_double_commutator_derivs.h5"
-                dataset_name = "f12_double_commutator_deriv" + str(deriv_order)
+            if os.path.exists("tei_derivs.h5"):
+                file_name = "tei_derivs.h5"
+                dataset_name = "f12_double_commutator_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("f12_double_commutator_partials.h5"):
-                file_name = "f12_double_commutator_partials.h5"
-                dataset_name = "f12_double_commutator_deriv" + str(deriv_order) + "_" + str(idx)
+            elif os.path.exists("tei_partials.h5"):
+                file_name = "tei_partials.h5"
+                dataset_name = "f12_double_commutator_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                      + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
+                                      + "_deriv" + str(deriv_order) + "_" + str(idx)
             else:
                 raise Exception("F12 Double Commutator derivatives not found on disk")
 
diff --git a/quax/methods/basis_utils.py b/quax/methods/basis_utils.py
index 727f176..36538a7 100644
--- a/quax/methods/basis_utils.py
+++ b/quax/methods/basis_utils.py
@@ -45,7 +45,7 @@ def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
     S_ao_ribs_ribs = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
 
     if options['spectral_shift']:
-        convergence = 1e-8
+        convergence = 1e-10
         fudge = jnp.asarray(jnp.linspace(0, 1, S_ao_ribs_ribs.shape[0])) * convergence
         shift = jnp.diag(fudge)
         S_ao_ribs_ribs += shift
@@ -74,21 +74,16 @@ def F_ij(s, m):
     """
     Can be numerically unstable if singular values are degenerate
     """
-
     F_ij = lambda i, j: jax.lax.cond(i == j, lambda: 0., lambda: 1 / (s[j]**2 - s[i]**2))
     F_fun = jax.vmap(jax.vmap(F_ij, (None, 0)), (0, None))
 
     indices = jnp.arange(m)
-    F = F_fun(indices, indices)
 
-    return F
+    return F_fun(indices, indices)
 
 @jax.custom_jvp
 def svd_full(A):
-
-    U, S, Vt = jnp.linalg.svd(A)
-
-    return U, S, Vt
+    return jnp.linalg.svd(A)
 
 @svd_full.defjvp
 def svd_full_jvp(primals, tangents):
@@ -102,7 +97,7 @@ def svd_full_jvp(primals, tangents):
 
     dP = U.T @ dA @ Vt.T
 
-    dS = jnp.fill_diagonal(jnp.zeros((m, n)), 1, inplace=False) * dP
+    dS = jnp.diagonal(dP)
 
     S1 = jnp.diag(S)
 
@@ -116,13 +111,11 @@ def svd_full_jvp(primals, tangents):
 
     dD2 = jnp.linalg.inv(S1) @ dP[:, m:] # Can be numerically unstable due to inversion
 
-    dD3 = jnp.zeros((n-m, n-m))
-
     dD_left = jnp.concatenate((dD1, dD2.T))
-    dD_right = jnp.concatenate((-dD2, dD3))
+    dD_right = jnp.concatenate((-dD2, jnp.zeros((n-m, n-m))))
 
     dD = jnp.concatenate((dD_left, dD_right), axis=1)
 
     dV = Vt.T @ dD
 
-    return (U, S, Vt), (dU, jnp.diagonal(dS), dV.T)
+    return (U, S, Vt), (dU, dS, dV.T)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index cb51a53..8ff66f3 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -21,8 +21,8 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check_oei = check_oei_disk(geom, basis_set, basis_set, xyz_path, deriv_order)
-        check_tei = check_tei_disk(geom, basis_set, basis_set, basis_set, basis_set, "eri", xyz_path, deriv_order)
+        check_oei = check_oei_disk("all", basis_set, basis_set, deriv_order)
+        check_tei = check_tei_disk("eri", basis_set, basis_set, basis_set, basis_set, deriv_order)
 
         oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
         tei_obj = TEI(basis_set, basis_set, basis_set, basis_set, xyz_path, deriv_order, options, 'disk')
@@ -40,7 +40,7 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
         if check_tei:
             G = tei_obj.eri(geom)
         else:
-            libint_interface.eri_deriv_disk(deriv_order)
+            libint_interface.compute_2e_deriv_disk("eri", 0., deriv_order)
             G = tei_obj.eri(geom)
 
     else:
@@ -66,14 +66,14 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cab
     if cabs:
         if algo == 'libint_disk':
             # Check disk for currently existing integral derivatives
-            check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
+            check = check_oei_disk("overlap", basis1, basis2, deriv_order)
     
             oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
             # If disk integral derivs are right, nothing to do
             if check:
                 S = oei_obj.overlap(geom)
             else:
-                libint_interface.oei_deriv_disk(deriv_order)
+                libint_interface.compute_1e_deriv_disk("overlap", deriv_order)
                 S = oei_obj.overlap(geom)
 
         else:
@@ -88,16 +88,21 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cab
     else:
         if algo == 'libint_disk':
             # Check disk for currently existing integral derivatives
-            check = check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order)
+            check_T = check_oei_disk("kinetic", basis1, basis2, deriv_order)
+            check_V = check_oei_disk("potential", basis1, basis2, deriv_order)
 
             oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
             # If disk integral derivs are right, nothing to do
-            if check:
+            if check_T:
                 T = oei_obj.kinetic(geom)
-                V = oei_obj.potential(geom)
             else:
-                libint_interface.oei_deriv_disk(deriv_order)
+                libint_interface.compute_1e_deriv_disk("kinetic",deriv_order)
                 T = oei_obj.kinetic(geom)
+
+            if check_V:
+                V = oei_obj.potential(geom)
+            else:
+                libint_interface.compute_1e_deriv_disk("potential", deriv_order)
                 V = oei_obj.potential(geom)
 
         else:
@@ -122,7 +127,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check = check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order)
+        check = check_tei_disk(int_type, basis1, basis2, basis3, basis4, deriv_order)
 
         tei_obj = TEI(basis1, basis2, basis3, basis4, xyz_path, deriv_order, options, 'disk')
         # If disk integral derivs are right, nothing to do
@@ -141,19 +146,19 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
         else:
             match int_type:
                 case "f12":
-                    libint_interface.f12_deriv_disk(beta, deriv_order)
+                    libint_interface.compute_2e_deriv_disk(int_type, beta, deriv_order)
                     F = tei_obj.f12(geom, beta)
                 case "f12_squared":
-                    libint_interface.f12_squared_deriv_disk(beta, deriv_order)
+                    libint_interface.compute_2e_deriv_disk(int_type, beta, deriv_order)
                     F = tei_obj.f12_squared(geom, beta)
                 case "f12g12":
-                    libint_interface.f12g12_deriv_disk(beta, deriv_order)
+                    libint_interface.compute_2e_deriv_disk(int_type, beta, deriv_order)
                     F = tei_obj.f12g12(geom, beta)
                 case "f12_double_commutator":
-                    libint_interface.f12_double_commutator_deriv_disk(beta, deriv_order)
+                    libint_interface.compute_2e_deriv_disk(int_type, beta, deriv_order)
                     F = tei_obj.f12_double_commutator(geom, beta)
                 case "eri":
-                    libint_interface.eri_deriv_disk(deriv_order)
+                    libint_interface.compute_2e_deriv_disk(int_type, 0., deriv_order)
                     F = tei_obj.eri(geom)
 
     else:
@@ -175,26 +180,31 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
     libint_interface.finalize()
     return F
 
-def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
-    # TODO need to check geometry and basis set name in addition to nbf
+def check_oei_disk(int_type, basis1, basis2, deriv_order, address=None):
     # Check OEI's in compute_integrals
     correct_int_derivs = False
+    correct_nbf1 = correct_nbf2 = correct_deriv_order = False
 
     if ((os.path.exists("oei_derivs.h5"))):
         print("Found currently existing one-electron integral derivatives in your working directory. Trying to use them.")
         oeifile = h5py.File('oei_derivs.h5', 'r')
-        with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
-        # Check if there are `deriv_order` datasets in the eri file
-        correct_deriv_order = len(oeifile) >= (3 * deriv_order)
-        # Check nbf dimension of integral arrays
-        sample_dataset_name = list(oeifile.keys())[0]
-        correct_nbf1 = oeifile[sample_dataset_name].shape[0] == nbf1
-        correct_nbf2 = oeifile[sample_dataset_name].shape[1] == nbf2
+
+        if int_type == "all":
+            oei_name = ["overlap_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
+                        "kinetic_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
+                        "potential_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order)]
+        else:
+            oei_name = int_type + "_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order)
+
+        for name in list(oeifile.keys()):
+            if name in oei_name:
+                correct_nbf1 = oeifile[name].shape[0] == nbf1
+                correct_nbf2 = oeifile[name].shape[1] == nbf2
+                correct_deriv_order = True
         oeifile.close()
+
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
 
     if correct_int_derivs:
@@ -207,8 +217,6 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         print("Found currently existing partial oei derivatives in working directory. Assuming they are correct.")
         oeifile = h5py.File('oei_partials.h5', 'r')
         with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         # Check if there are `deriv_order` datasets in the eri file
@@ -220,29 +228,30 @@ def check_oei_disk(geom, basis1, basis2, xyz_path, deriv_order, address=None):
         oeifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 """
 
-def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, deriv_order, address=None):
-    # TODO need to check geometry and basis set name in addition to nbf
+def check_tei_disk(int_type, basis1, basis2, basis3, basis4, deriv_order, address=None):
     # Check TEI's in compute_integrals
     correct_int_derivs = False
+    correct_nbf1 = correct_nbf2 = correct_nbf3 = correct_nbf4 = correct_deriv_order = False
 
-    if ((os.path.exists(int_type + "_derivs.h5"))):
+    if ((os.path.exists("tei_derivs.h5"))):
         print("Found currently existing " + int_type + " integral derivatives in your working directory. Trying to use them.")
-        erifile = h5py.File(int_type + '_derivs.h5', 'r')
-        with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        erifile = h5py.File('tei_derivs.h5', 'r')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         nbf3 = basis3.nbf()
         nbf4 = basis4.nbf()
-        # Check if there are `deriv_order` datasets in the eri file
-        correct_deriv_order = len(erifile) >= deriv_order
+
+        tei_name = int_type + "_" + str(nbf1) + "_" + str(nbf2)\
+                            + "_" + str(nbf3) + "_" + str(nbf4) + "_deriv" + str(deriv_order)
+        
         # Check nbf dimension of integral arrays
-        sample_dataset_name = list(erifile.keys())[0]
-        correct_nbf1 = erifile[sample_dataset_name].shape[0] == nbf1
-        correct_nbf2 = erifile[sample_dataset_name].shape[1] == nbf2
-        correct_nbf3 = erifile[sample_dataset_name].shape[2] == nbf3
-        correct_nbf4 = erifile[sample_dataset_name].shape[3] == nbf4
+        for name in list(erifile.keys()):
+            if name in tei_name:
+                correct_nbf1 = erifile[name].shape[0] == nbf1
+                correct_nbf2 = erifile[name].shape[1] == nbf2
+                correct_nbf3 = erifile[name].shape[2] == nbf3
+                correct_nbf4 = erifile[name].shape[3] == nbf4
+                correct_deriv_order = True
         erifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 and correct_nbf3 and correct_nbf4
     
@@ -255,9 +264,6 @@ def check_tei_disk(geom, basis1, basis2, basis3, basis4, int_type, xyz_path, der
     elif ((os.path.exists("eri_partials.h5"))):
         print("Found currently existing partial tei derivatives in working directory. Assuming they are correct.")
         erifile = h5py.File('eri_partials.h5', 'r')
-        with open(xyz_path, 'r') as f:
-            tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         nbf3 = basis3.nbf()

From 6e977ab2fa0f4124d1702fc1adc29fc7ef05c817 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Thu, 25 Jan 2024 16:09:15 -0500
Subject: [PATCH 39/91] Put into separate disk files

---
 quax/integrals/libint_interface.cc | 35 ++++++++++++++++++++------
 quax/integrals/tei.py              | 40 +++++++++++++++---------------
 quax/methods/ints.py               |  4 +--
 3 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 8e83cb4..10eacd8 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -75,6 +75,25 @@ libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
     return cabs;
 }
 
+// Returns number of basis functions
+int nbf(std::string basis, std::string xyzfilename) {
+    libint2::initialize();
+    atoms = get_atoms(xyzfilename);
+
+    // Move harddrive load of basis and xyz to happen only once
+    libint2::BasisSet bs = libint2::BasisSet(basis, atoms);
+    bs.set_pure(false); // use cartesian gaussians
+    if (basis.find("-cabs", 10) != std::string::npos) {
+        bs = make_ao_cabs(basis, bs);
+    }
+
+    int nbf = static_cast<int>(bs.nbf());
+
+    libint2::finalize();
+
+    return nbf;
+}
+
 // Must call initialize before computing ints 
 void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
                 std::string basis3, std::string basis4) {
@@ -447,6 +466,8 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
             auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
             auto n2 = bs2[s2].size();    // number of basis functions in shell 2
 
+            // If the atoms are the same we ignore it as the derivatives will be zero.
+            if (atom1 == atom2 && type != "potential") continue;
             // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
             std::vector<long> shell_atom_index_list{atom1, atom2};
 
@@ -878,12 +899,6 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
 void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) { 
     std::cout << "Writing two-electron " << type << " integral derivative tensors up to order " 
                                          << max_deriv_order << " to disk...";
-    const H5std_string file_name("tei_derivs.h5");
-    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
-    double fillvalue = 0.0;
-    DSetCreatPropList plist;
-    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
-
     // Check to make sure you are not flooding the disk.
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
@@ -892,6 +907,13 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
     double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
     assert(check < 50 && "Total disk space required for ERI's exceeds 50 GB. Increase threshold and recompile to proceed.");
 
+    // Create H5 File and prepare to fill with 0.0's                                         
+    const H5std_string file_name(type + "_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
         // Number of unique shell derivatives output by libint (number of indices in buffer)
         int nshell_derivs = how_many_derivs(4, deriv_order);
@@ -1582,4 +1604,3 @@ PYBIND11_MODULE(libint_interface, m) {
     //m.def("compute_2e_partial_deriv_disk", &compute_2e_partial_deriv_disk, "Computes a subset of the full coulomb integral nuclear derivative tensor and writes them to disk with HDF5");
      m.attr("LIBINT2_MAX_DERIV_ORDER") = LIBINT2_MAX_DERIV_ORDER;
 }
-
diff --git a/quax/integrals/tei.py b/quax/integrals/tei.py
index b22a192..640284e 100644
--- a/quax/integrals/tei.py
+++ b/quax/integrals/tei.py
@@ -156,14 +156,14 @@ def eri_deriv_impl(self, geom, deriv_vec):
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("tei_derivs.h5"):
-                file_name = "tei_derivs.h5"
+            if os.path.exists("eri_derivs.h5"):
+                file_name = "eri_derivs.h5"
                 dataset_name = "eri_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("tei_partials.h5"):
-                file_name = "tei_partials.h5"
+            elif os.path.exists("eri_partials.h5"):
+                file_name = "eri_partials.h5"
                 dataset_name = "eri_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order) + "_" + str(idx)
@@ -193,14 +193,14 @@ def f12_deriv_impl(self, geom, beta, deriv_vec):
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("tei_derivs.h5"):
-                file_name = "tei_derivs.h5"
+            if os.path.exists("f12_derivs.h5"):
+                file_name = "f12_derivs.h5"
                 dataset_name = "f12_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("tei_partials.h5"):
-                file_name = "tei_partials.h5"
+            elif os.path.exists("f12_partials.h5"):
+                file_name = "f12_partials.h5"
                 dataset_name = "f12_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order) + "_" + str(idx)
@@ -230,14 +230,14 @@ def f12_squared_deriv_impl(self, geom, beta, deriv_vec):
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("tei_derivs.h5"):
-                file_name = "tei_derivs.h5"
+            if os.path.exists("f12_squared_derivs.h5"):
+                file_name = "f12_squared_derivs.h5"
                 dataset_name = "f12_squared_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("tei_partials.h5"):
-                file_name = "tei_partials.h5"
+            elif os.path.exists("f12_squared_partials.h5"):
+                file_name = "f12_squared_partials.h5"
                 dataset_name = "f12_squared_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order) + "_" + str(idx)
@@ -267,14 +267,14 @@ def f12g12_deriv_impl(self, geom, beta, deriv_vec):
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("tei_derivs.h5"):
-                file_name = "tei_derivs.h5"
+            if os.path.exists("f12g12_derivs.h5"):
+                file_name = "f12g12_derivs.h5"
                 dataset_name = "f12g12_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("tei_partials.h5"):
-                file_name = "tei_partials.h5"
+            elif os.path.exists("f12g12_partials.h5"):
+                file_name = "f12g12_partials.h5"
                 dataset_name = "f12g12_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order) + "_" + str(idx)
@@ -304,14 +304,14 @@ def f12_double_commutator_deriv_impl(self, geom, beta, deriv_vec):
         # Read from disk
         elif self.mode == 'disk':
             # By default, look for full derivative tensor file with datasets named (type)_deriv(order)
-            if os.path.exists("tei_derivs.h5"):
-                file_name = "tei_derivs.h5"
+            if os.path.exists("f12_double_commutator_derivs.h5"):
+                file_name = "f12_double_commutator_derivs.h5"
                 dataset_name = "f12_double_commutator_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order)
             # if not found, look for partial derivative tensor file with datasets named (type)_deriv(order)_(flattened_uppertri_idx)
-            elif os.path.exists("tei_partials.h5"):
-                file_name = "tei_partials.h5"
+            elif os.path.exists("f12_double_commutator_partials.h5"):
+                file_name = "f12_double_commutator_partials.h5"
                 dataset_name = "f12_double_commutator_" + str(self.nbf1) + "_" + str(self.nbf2)\
                                       + "_" + str(self.nbf3) + "_" + str(self.nbf4)\
                                       + "_deriv" + str(deriv_order) + "_" + str(idx)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 8ff66f3..a2adfc4 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -233,9 +233,9 @@ def check_tei_disk(int_type, basis1, basis2, basis3, basis4, deriv_order, addres
     correct_int_derivs = False
     correct_nbf1 = correct_nbf2 = correct_nbf3 = correct_nbf4 = correct_deriv_order = False
 
-    if ((os.path.exists("tei_derivs.h5"))):
+    if ((os.path.exists(int_type + "_derivs.h5"))):
         print("Found currently existing " + int_type + " integral derivatives in your working directory. Trying to use them.")
-        erifile = h5py.File('tei_derivs.h5', 'r')
+        erifile = h5py.File(int_type + '_derivs.h5', 'r')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
         nbf3 = basis3.nbf()

From feb31321414770a239e4879eb86291b8cf9a2b0e Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 31 Jan 2024 16:21:07 -0500
Subject: [PATCH 40/91] OEI Screening WIP

---
 quax/core.py                       |   1 +
 quax/integrals/libint_interface.cc | 311 +++++++++++++++++++----------
 quax/integrals/makefile            |   6 +-
 quax/integrals/oei.py              |   3 +
 quax/methods/ints.py               |   6 +-
 5 files changed, 218 insertions(+), 109 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 0c861c5..ccfe62e 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -38,6 +38,7 @@ def check_options(options):
                        'damp_factor': 0.5,
                        'spectral_shift': True,
                        'integral_algo': 'libint_core',
+                       'ints_tolerance': 1.0e-14,
                        'beta': 1.0
                       }
 
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 10eacd8..ccc2d82 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -29,6 +29,7 @@ std::vector<long> shell2atom_1, shell2atom_2, shell2atom_3, shell2atom_4;
 size_t max_nprim;
 int max_l;
 int nthreads = 1;
+double threshold;
 
 // Creates atom objects from xyz file path
 std::vector<libint2::Atom> get_atoms(std::string xyzfilename) 
@@ -96,12 +97,14 @@ int nbf(std::string basis, std::string xyzfilename) {
 
 // Must call initialize before computing ints 
 void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
-                std::string basis3, std::string basis4) {
+                std::string basis3, std::string basis4, double ints_tol) {
     libint2::initialize();
     atoms = get_atoms(xyzfilename);
     natom = atoms.size();
     ncart = natom * 3;
 
+    threshold = ints_tol;
+
     // Move harddrive load of basis and xyz to happen only once
     bs1 = libint2::BasisSet(basis1, atoms);
     bs1.set_pure(false); // use cartesian gaussians
@@ -280,8 +283,71 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
     return combos;
 }
 
+// Computes non-negligible shell pair list
+std::vector<std::pair<int, int>> build_shellpairs() {
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+
+    // construct the 2-electron repulsion integrals engine
+    std::vector<libint2::Engine> engines(nthreads);
+    engines[0] = libint2::Engine(libint2::Operator::overlap, max_nprim, max_l);
+    engines[0].set_precision(0.);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    std::vector<std::vector<std::pair<int, int>>> threads_sp_list(nthreads);
+    double threshold_sq = threshold * threshold;
+
+    #pragma omp parallel num_threads(nthreads)
+    {
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        auto &engine = engines[thread_id];
+        const auto &buf = engine.results();
+
+        // loop over permutationally-unique set of shells
+        for (auto s1 = 0l, s12 = 0l; s1 != bs1.size(); ++s1) {
+            auto n1 = bs1[s1].size();
+
+            auto s2_max = bs1_equiv_bs2 ? s1 : bs2.size() - 1;
+            for (auto s2 = 0; s2 <= s2_max; ++s2, ++s12) {
+                if (s12 % nthreads != thread_id) continue;
+
+                auto on_same_center = (bs1[s1].O == bs2[s2].O);
+                bool significant = on_same_center;
+                if (!on_same_center) {
+                    auto n2 = bs2[s2].size();
+                    engines[thread_id].compute(bs1[s1], bs2[s2]);
+                    double normsq = std::inner_product(buf[0], buf[0] + n1 * n2, buf[0], 0.0);
+                    significant = (normsq >= threshold_sq);
+                }
+
+                if (significant) {
+                    threads_sp_list[thread_id].push_back(std::make_pair(s1, s2));
+                } else {
+                    std::cout << "Removed Set: " << s1 << " " << s2 << std::endl;
+                }
+            }
+        }
+    }  // end of compute
+
+    for (int thread = 1; thread < nthreads; ++thread) {
+        for (const auto &pair : threads_sp_list[thread]) {
+            threads_sp_list[0].push_back(pair);
+        }
+    }
+
+    return threads_sp_list[0];
+}
+
 // Compute one-electron integral
 py::array compute_1e_int(std::string type) {
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs();
+
     // Integral engine
     std::vector<libint2::Engine> engines(nthreads);
     
@@ -303,26 +369,38 @@ py::array compute_1e_int(std::string type) {
     size_t length = nbf1 * nbf2;
     std::vector<double> result(length); // vector to store integral array
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-            auto n1 = bs1[s1].size(); // number of basis functions in first shell
-            auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-            auto n2 = bs2[s2].size(); // number of basis functions in second shell
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
 
-            size_t thread_id = 0;
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in first shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+
+        size_t thread_id = 0;
 #ifdef _OPENMP
-            thread_id = omp_get_thread_num();
+        thread_id = omp_get_thread_num();
 #endif
-            engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
-            auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-            if (ints_shellset == nullptr)
-                continue;  // nullptr returned if the entire shell-set was screened out
+        auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+        if (ints_shellset == nullptr)
+            continue;  // nullptr returned if the entire shell-set was screened out
 
-            // Loop over shell block, keeping a total count idx for the size of shell set
+        // Loop over shell block, keeping a total count idx for the size of shell set
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                    result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
+                    result[(bf2 + f2) * nbf1 + bf1 + f1] = ints_shellset[idx];
+                }
+            }
+        } else {
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                     result[(bf1 + f1) * nbf2 + bf2 + f2] = ints_shellset[idx];
@@ -1324,6 +1402,10 @@ void oei_deriv_disk(int max_deriv_order) {
 
 // Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
 std::vector<py::array> oei_deriv_core(int deriv_order) {
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = true; // Only used for HF-type integrals
+    auto shellpairs = build_shellpairs();
+
     // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
     // how many shell and operator derivatives for potential integrals
     int nshell_derivs = how_many_derivs(2, deriv_order);
@@ -1357,95 +1439,109 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
     std::vector<double> T(length);
     std::vector<double> V(length);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-            std::vector<long> shell_atom_index_list{atom1, atom2};
-
-            size_t thread_id = 0;
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in first shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell    
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+        std::vector<long> shell_atom_index_list{atom1, atom2};
+
+        size_t thread_id = 0;
 #ifdef _OPENMP
-            thread_id = omp_get_thread_num();
+        thread_id = omp_get_thread_num();
 #endif
-            s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
-
-            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
-
-                // Look up multidimensional cartesian derivative index
-                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                // What follows fills these indices
-                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
-
-                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                for (int j = 0; j < multi_cart_idx.size(); j++){
-                    int desired_atom_idx = multi_cart_idx[j] / 3;
-                    int desired_coord = multi_cart_idx[j] % 3;
-                    // Loop over shell indices
-                    for (int i = 0; i < 2; i++){
-                        int atom_idx = shell_atom_index_list[i];
-                        if (atom_idx == desired_atom_idx) {
-                            int tmp = 3 * i + desired_coord;
-                            indices[j].push_back(tmp);
-                            potential_indices[j].push_back(tmp);
-                        }
+        s_engines[thread_id].compute(s1, s2); // Compute shell set
+        t_engines[thread_id].compute(s1, s2); // Compute shell set
+        v_engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+        const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+        const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
+
+        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+        // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+            size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2;
+
+            // Look up multidimensional cartesian derivative index
+            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+            // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
+            // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+            // What follows fills these indices
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+            std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+
+            // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+            // and check to see if it is present in the shell duet, and where it is present in the potential operator
+            for (int j = 0; j < multi_cart_idx.size(); j++){
+                int desired_atom_idx = multi_cart_idx[j] / 3;
+                int desired_coord = multi_cart_idx[j] % 3;
+                // Loop over shell indices
+                for (int i = 0; i < 2; i++){
+                    int atom_idx = shell_atom_index_list[i];
+                    if (atom_idx == desired_atom_idx) {
+                        int tmp = 3 * i + desired_coord;
+                        indices[j].push_back(tmp);
+                        potential_indices[j].push_back(tmp);
                     }
-                    // Now for potentials only, loop over each atom in molecule, and if this derivative
-                    // differentiates wrt that atom, we also need to collect that index.
-                    for (int i = 0; i < natom; i++){
-                        if (i == desired_atom_idx) {
-                            int tmp = 3 * (i + 2) + desired_coord;
-                            potential_indices[j].push_back(tmp);
-                        }
+                }
+                // Now for potentials only, loop over each atom in molecule, and if this derivative
+                // differentiates wrt that atom, we also need to collect that index.
+                for (int i = 0; i < natom; i++){
+                    if (i == desired_atom_idx) {
+                        int tmp = 3 * (i + 2) + desired_coord;
+                        potential_indices[j].push_back(tmp);
                     }
                 }
+            }
 
-                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                // and the total number of subvectors is the order of differentiation
-                // Now we want all combinations where we pick exactly one index from each subvector.
-                // This is achievable through a cartesian product
-                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
-                std::vector<int> buffer_indices;
-                std::vector<int> potential_buffer_indices;
-                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                    buffer_indices.push_back(buf_idx);
-                }
-                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                for (auto vec : potential_index_combos)  {
-                    std::sort(vec.begin(), vec.end());
-                    int buf_idx = 0;
-                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                    potential_buffer_indices.push_back(buf_idx);
-                }
+            // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+            std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+            std::vector<int> buffer_indices;
+            std::vector<int> potential_buffer_indices;
+            // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
+            }
+            // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+            for (auto vec : potential_index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                potential_buffer_indices.push_back(buf_idx);
+            }
 
-                // Loop over shell block for each buffer index which contributes to this derivative
-                // Overlap and Kinetic
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+            // Loop over shell block for each buffer index which contributes to this derivative
+            // Overlap and Kinetic
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                if (bs1_equiv_bs2 && p1 != p2) {
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
+                            S[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
+                            T[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += kinetic_shellset[idx];
+                        }
+                    }
+                } else {
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
@@ -1453,17 +1549,26 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                         }
                     }
                 }
-                // Potential
-                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+            }
+            // Potential
+            for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                if (bs1_equiv_bs2 && p1 != p2) {
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
+                            V[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += potential_shellset[idx];
+                        }
+                    }
+                } else {
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
                         }
                     }
                 }
-            } // Unique nuclear cartesian derivative indices loop
-        }
+            }
+        } // Unique nuclear cartesian derivative indices loop
     } // shell duet loops
     return {py::array(S.size(), S.data()), py::array(T.size(), T.data()), py::array(V.size(), V.data())}; // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
 } // oei_deriv_core function
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index d2ef2a9..d7c5a88 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -2,11 +2,11 @@
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
 CC      := g++
 # Options passed to compiler, add "-fopenmp" if intending to use OpenMP
-CFLAGS  := -O3 -fPIC -fopenmp
+CFLAGS  := -O3 -fPIC -fopenmp -g
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/ecm23353/psi_env
+LIBINT_PREFIX := /home/vulcan/ecm23353/.conda/envs/f12
 # Conda prefix location, it is suggested to use conda to install nearly all dependencies
-CONDA_PREFIX := /home/ecm23353/psi_env
+CONDA_PREFIX := /home/vulcan/ecm23353/.conda/envs/f12
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 608f275..074a417 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -111,6 +111,7 @@ def overlap_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
+            jax.debug.print(" {b} ", b=jnp.allclose(S, S.T))
             return jnp.asarray(S)
         if self.mode == 'f12':
             S = libint_interface.compute_1e_deriv("overlap", deriv_vec)
@@ -143,6 +144,7 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
+            jax.debug.print(" {b} ", b=jnp.allclose(T, T.T))
             return jnp.asarray(T)
         if self.mode == 'f12':
             T = libint_interface.compute_1e_deriv("kinetic", deriv_vec)
@@ -175,6 +177,7 @@ def potential_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
+            jax.debug.print(" {b} ", b=jnp.allclose(V, V.T))
             return jnp.asarray(V)
         if self.mode == 'f12':
             V = libint_interface.compute_1e_deriv("potential", deriv_vec)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index a2adfc4..f137153 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -17,7 +17,7 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk 
     algo = options['integral_algo']
     basis_name = basis_set.name()
-    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name)
+    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
@@ -61,7 +61,7 @@ def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cab
     algo = options['integral_algo']
     basis1_name = basis1.name()
     basis2_name = basis2.name()
-    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name)
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name, options['ints_tolerance'])
 
     if cabs:
         if algo == 'libint_disk':
@@ -123,7 +123,7 @@ def compute_f12_teints(geom, basis1, basis2, basis3, basis4, int_type, xyz_path,
     basis2_name = basis2.name()
     basis3_name = basis3.name()
     basis4_name = basis4.name()
-    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis3_name, basis4_name)
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis3_name, basis4_name, options['ints_tolerance'])
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives

From 3d1c15744c193b33d8eb0e02f55911cfc1677a72 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Thu, 1 Feb 2024 17:35:58 -0500
Subject: [PATCH 41/91] OEI shellpairs Work

---
 quax/integrals/libint_interface.cc | 456 +++++++++++++++++------------
 1 file changed, 267 insertions(+), 189 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index ccc2d82..4890d62 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -283,9 +283,9 @@ std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv
     return combos;
 }
 
-// Computes non-negligible shell pair list
-std::vector<std::pair<int, int>> build_shellpairs() {
-    const auto bs1_equiv_bs2 = (bs1 == bs2);
+// Computes non-negligible shell pair list for one-electron integrals
+std::vector<std::pair<int, int>> build_shellpairs(libint2::BasisSet A, libint2::BasisSet B) {
+    const auto A_equiv_B = (A == B);
 
     // construct the 2-electron repulsion integrals engine
     std::vector<libint2::Engine> engines(nthreads);
@@ -308,18 +308,18 @@ std::vector<std::pair<int, int>> build_shellpairs() {
         const auto &buf = engine.results();
 
         // loop over permutationally-unique set of shells
-        for (auto s1 = 0l, s12 = 0l; s1 != bs1.size(); ++s1) {
-            auto n1 = bs1[s1].size();
+        for (auto s1 = 0l, s12 = 0l; s1 != A.size(); ++s1) {
+            auto n1 = A[s1].size();
 
-            auto s2_max = bs1_equiv_bs2 ? s1 : bs2.size() - 1;
+            auto s2_max = A_equiv_B ? s1 : B.size() - 1;
             for (auto s2 = 0; s2 <= s2_max; ++s2, ++s12) {
                 if (s12 % nthreads != thread_id) continue;
 
-                auto on_same_center = (bs1[s1].O == bs2[s2].O);
+                auto on_same_center = (A[s1].O == B[s2].O);
                 bool significant = on_same_center;
                 if (!on_same_center) {
-                    auto n2 = bs2[s2].size();
-                    engines[thread_id].compute(bs1[s1], bs2[s2]);
+                    auto n2 = B[s2].size();
+                    engines[thread_id].compute(A[s1], B[s2]);
                     double normsq = std::inner_product(buf[0], buf[0] + n1 * n2, buf[0], 0.0);
                     significant = (normsq >= threshold_sq);
                 }
@@ -346,7 +346,7 @@ std::vector<std::pair<int, int>> build_shellpairs() {
 py::array compute_1e_int(std::string type) {
     // Shell pairs after screening
     const auto bs1_equiv_bs2 = (bs1 == bs2);
-    auto shellpairs = build_shellpairs();
+    auto shellpairs = build_shellpairs(bs1, bs2);
 
     // Integral engine
     std::vector<libint2::Engine> engines(nthreads);
@@ -381,7 +381,7 @@ py::array compute_1e_int(std::string type) {
         auto bf1 = shell2bf_1[p1];  // first basis function in first shell
         auto bf2 = shell2bf_2[p2];  // first basis function in second shell
 
-        size_t thread_id = 0;
+        int thread_id = 0;
 #ifdef _OPENMP
         thread_id = omp_get_thread_num();
 #endif
@@ -461,7 +461,7 @@ py::array compute_2e_int(std::string type, double beta) {
                     auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
                     auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
 
-                    size_t thread_id = 0;
+                    int thread_id = 0;
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
@@ -510,7 +510,10 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
     // Potential integrals buffer is flattened upper triangle of (6 + NCART) dimensional deriv_order tensor
     int d1_buf_idx = (type == "potential") ? 6 + ncart : 6;
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(d1_buf_idx, deriv_order);
-    
+
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
 
     // One-electron integral derivative engine
     std::vector<libint2::Engine> engines(nthreads);
@@ -534,73 +537,88 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
     size_t length = nbf1 * nbf2;
     std::vector<double> result(length);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-            auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-            auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-            auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-            auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-            auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-
-            // If the atoms are the same we ignore it as the derivatives will be zero.
-            if (atom1 == atom2 && type != "potential") continue;
-            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-            std::vector<long> shell_atom_index_list{atom1, atom2};
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
 
-            // For every desired atom derivative, check shell and nuclear indices for a match,
-            // add it to subvector for that derivative
-            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-            for (int j = 0; j < desired_atom_indices.size(); j++){
-                int desired_atom_idx = desired_atom_indices[j];
-                // Shell indices
-                for (int i = 0; i < 2; i++){
-                    int atom_idx = shell_atom_index_list[i];
-                    if (atom_idx == desired_atom_idx) { 
-                        int tmp = 3 * i + desired_coordinates[j];
-                        indices[j].push_back(tmp);
-                    }
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+
+        // If the atoms are the same we ignore it as the derivatives will be zero.
+        if (atom1 == atom2 && type != "potential") continue;
+        // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+        std::vector<long> shell_atom_index_list{atom1, atom2};
+
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+
+        // For every desired atom derivative, check shell and nuclear indices for a match,
+        // add it to subvector for that derivative
+        // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+        for (int j = 0; j < desired_atom_indices.size(); j++){
+            int desired_atom_idx = desired_atom_indices[j];
+            // Shell indices
+            for (int i = 0; i < 2; i++){
+                int atom_idx = shell_atom_index_list[i];
+                if (atom_idx == desired_atom_idx) {
+                    int tmp = 3 * i + desired_coordinates[j];
+                    indices[j].push_back(tmp);
                 }
-                
-                if (type == "potential") {
-                    for (int i = 0; i < natom; i++){
-                        // i = shell_atom_index_list[i];
-                        if (i == desired_atom_idx) {
-                            int tmp = 3 * (i + 2) + desired_coordinates[j];
-                            indices[j].push_back(tmp);
-                        }
+            }
+
+            if (type == "potential") {
+                for (int i = 0; i < natom; i++){
+                    // i = shell_atom_index_list[i];
+                    if (i == desired_atom_idx) {
+                        int tmp = 3 * (i + 2) + desired_coordinates[j];
+                        indices[j].push_back(tmp);
                     }
                 }
             }
-            
-            // Now indices is a vector of vectors, where each subvector is your choices
-            // for the first derivative operator, second, third, etc
-            // and the total number of subvectors is the order of differentiation
-            // Now we want all combinations where we pick exactly one index from each subvector.
-            // This is achievable through a cartesian product
-            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-            std::vector<int> buffer_indices;
+        }
+        
+        // Now indices is a vector of vectors, where each subvector is your choices
+        // for the first derivative operator, second, third, etc
+        // and the total number of subvectors is the order of differentiation
+        // Now we want all combinations where we pick exactly one index from each subvector.
+        // This is achievable through a cartesian product
+        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+        std::vector<int> buffer_indices;
+
+        // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+        for (auto vec : index_combos)  {
+            std::sort(vec.begin(), vec.end());
+            int buf_idx = 0;
+            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+            buffer_indices.push_back(buf_idx);
+        }
 
-            // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-            for (auto vec : index_combos)  {
-                std::sort(vec.begin(), vec.end());
-                int buf_idx = 0;
-                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                buffer_indices.push_back(buf_idx);
+        // Loop over every buffer index and accumulate for every shell set.
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto ints_shellset = buf_vec[buffer_indices[i]];
+                if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        result[(bf1 + f1) * nbf2 + bf2 + f2] += ints_shellset[idx];
+                        result[(bf2 + f2) * nbf1 + bf1 + f1] += ints_shellset[idx];
+                    }
+                }
             }
-
-            // Compute the integrals
-            size_t thread_id = 0;
-#ifdef _OPENMP
-            thread_id = omp_get_thread_num();
-#endif
-            engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
-
-            // Loop over every buffer index and accumulate for every shell set.
+        } else {
             for(auto i = 0; i < buffer_indices.size(); ++i) {
                 auto ints_shellset = buf_vec[buffer_indices[i]];
                 if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -733,7 +751,7 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
                     }
 
                     // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    size_t thread_id = 0;
+                    int thread_id = 0;
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
@@ -859,7 +877,7 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
                 auto n2 = bs2[s2].size();    // number of basis functions in shell 2
                 std::vector<long> shell_atom_index_list{atom1, atom2};
 
-                size_t thread_id = 0;
+                int thread_id = 0;
 #ifdef _OPENMP
                 thread_id = omp_get_thread_num();
 #endif
@@ -1072,7 +1090,7 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                         if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                         std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-                        size_t thread_id = 0;
+                        int thread_id = 0;
 #ifdef _OPENMP
                         thread_id = omp_get_thread_num();
 #endif
@@ -1198,6 +1216,9 @@ void oei_deriv_disk(int max_deriv_order) {
         total_deriv_slices += how_many_derivs(natom, i);
     }
 
+    // Shell pairs after screening
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
     // Create H5 File and prepare to fill with 0.0's
     const H5std_string file_name("oei_derivs.h5");
     H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
@@ -1256,95 +1277,117 @@ void oei_deriv_disk(int max_deriv_order) {
         /* Initialize lock */
         omp_init_lock(&lock);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                std::vector<long> shell_atom_index_list{atom1, atom2};
+#pragma omp parallel for num_threads(nthreads)
+        for (const auto &pair : shellpairs) {
+            int p1 = pair.first;
+            int p2 = pair.second;
+
+            const auto &s1 = bs1[p1];
+            const auto &s2 = bs2[p2];
+            auto n1 = bs1[p1].size(); // number of basis functions in first shell
+            auto n2 = bs2[p2].size(); // number of basis functions in second shell
+            auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+            auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+            auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+            auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
 
-                size_t thread_id = 0;
+            int thread_id = 0;
 #ifdef _OPENMP
-                thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                s_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                t_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                v_engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
-                const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
-                const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets;
-
-                // Define shell set slabs
-                double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
-                double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
-                double potential_shellset_slab [n1][n2][nderivs_triu] = {};
-
-                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                    // Look up multidimensional cartesian derivative index
-                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                    // For overlap/kinetic and potential sepearately, create a vector of vectors called `indices`, where each subvector
-                    // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
-                    // What follows fills these indices
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
-
-                    // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
-                    // and check to see if it is present in the shell duet, and where it is present in the potential operator
-                    for (int j = 0; j < multi_cart_idx.size(); j++){
-                        int desired_atom_idx = multi_cart_idx[j] / 3;
-                        int desired_coord = multi_cart_idx[j] % 3;
-                        // Loop over shell indices
-                        for (int i = 0; i < 2; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coord;
-                                indices[j].push_back(tmp);
-                                potential_indices[j].push_back(tmp);
-                            }
+            s_engines[thread_id].compute(s1, s2); // Compute shell set
+            t_engines[thread_id].compute(s1, s2); // Compute shell set
+            v_engines[thread_id].compute(s1, s2); // Compute shell set
+            const auto& overlap_buffer = s_engines[thread_id].results(); // will point to computed shell sets
+            const auto& kinetic_buffer = t_engines[thread_id].results(); // will point to computed shell sets
+            const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
+
+            // Define shell set slabs
+            double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
+            double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
+            double potential_shellset_slab [n1][n2][nderivs_triu] = {};
+
+            double overlap_shellset_slab_T [n2][n1][nderivs_triu] = {};
+            double kinetic_shellset_slab_T [n2][n1][nderivs_triu] = {};
+            double potential_shellset_slab_T [n2][n1][nderivs_triu] = {};
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // For overlap/kinetic and potential separately, create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                std::vector<std::vector<int>> potential_indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet, and where it is present in the potential operator
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                            potential_indices[j].push_back(tmp);
                         }
-                        // Now for potentials only, loop over each atom in molecule, and if this derivative
-                        // differentiates wrt that atom, we also need to collect that index.
-                        for (int i = 0; i < natom; i++){
-                            if (i == desired_atom_idx) {
-                                int tmp = 3 * (i + 2) + desired_coord;
-                                potential_indices[j].push_back(tmp);
-                            }
+                    }
+                    // Now for potentials only, loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index.
+                    for (int i = 0; i < natom; i++){
+                        if (i == desired_atom_idx) {
+                            int tmp = 3 * (i + 2) + desired_coord;
+                            potential_indices[j].push_back(tmp);
                         }
                     }
+                }
 
-                    // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
-                    std::vector<int> buffer_indices;
-                    std::vector<int> potential_buffer_indices;
-                    // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
-                    }
-                    // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : potential_index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
-                        if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
-                        potential_buffer_indices.push_back(buf_idx);
-                    }
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<std::vector<int>> potential_index_combos = cartesian_product(potential_indices);
+                std::vector<int> buffer_indices;
+                std::vector<int> potential_buffer_indices;
+                // Overlap/Kinetic integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
+                }
+                // Potential integrals: collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : potential_index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(potential_buffer_multidim_lookup.begin(), potential_buffer_multidim_lookup.end(), vec);
+                    if (it != potential_buffer_multidim_lookup.end()) buf_idx = it - potential_buffer_multidim_lookup.begin();
+                    potential_buffer_indices.push_back(buf_idx);
+                }
 
-                    // Loop over shell block for each buffer index which contributes to this derivative
-                    // Overlap and Kinetic
+                // Loop over shell block for each buffer index which contributes to this derivative
+                // Overlap and Kinetic
+                if (p1 != p2) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
+                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
+                                overlap_shellset_slab_T[f2][f1][nuc_idx] += overlap_shellset[idx];
+                                kinetic_shellset_slab_T[f2][f1][nuc_idx] += kinetic_shellset[idx];
+                            }
+                        }
+                    }
+                } else {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto overlap_shellset = overlap_buffer[buffer_indices[i]];
                         auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
@@ -1355,37 +1398,66 @@ void oei_deriv_disk(int max_deriv_order) {
                             }
                         }
                     }
-                    // Potential
+                }
+                // Potential
+                if (p1 != p2) {
                     for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
                         auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                             for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                                 potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
+                                potential_shellset_slab_T[f2][f1][nuc_idx] += potential_shellset[idx];
                             }
                         }
                     }
-                } // Unique nuclear cartesian derivative indices loop
-
-                /* Serialize HDF dataset writing using OpenMP lock */
-                omp_set_lock(&lock);
-
+                } else {
+                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
+                            }
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+
+            /* Serialize HDF dataset writing using OpenMP lock */
+            omp_set_lock(&lock);
+
+            // Now write this shell set slab to HDF5 file
+            // Create file space hyperslab, defining where to write data to in file
+            hsize_t count[3] = {n1, n2, nderivs_triu};
+            hsize_t start[3] = {bf1, bf2, 0};
+            fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+            // Create dataspace defining for memory dataset to write to file
+            hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+            DataSpace mspace(3, mem_dims);
+            mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+            // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+            overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+            kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+            potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+            if (p1 != p2) {
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
-                hsize_t count[3] = {n1, n2, nderivs_triu};
-                hsize_t start[3] = {bf1, bf2, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                hsize_t count_T[3] = {n2, n1, nderivs_triu};
+                hsize_t start_T[3] = {bf2, bf1, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
                 // Create dataspace defining for memory dataset to write to file
-                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
-                DataSpace mspace(3, mem_dims);
-                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                hsize_t mem_dims_T[] = {n2, n1, nderivs_triu};
+                DataSpace mspace_T(3, mem_dims_T);
+                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
                 // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                /* Release lock */
-                omp_unset_lock(&lock);
+                overlap_dataset->write(overlap_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                kinetic_dataset->write(kinetic_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                potential_dataset->write(potential_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
             }
+
+            /* Release lock */
+            omp_unset_lock(&lock);
+
         } // shell duet loops
         // Delete datasets for this derivative order
         delete overlap_dataset;
@@ -1403,8 +1475,7 @@ void oei_deriv_disk(int max_deriv_order) {
 // Computes a single 'deriv_order' derivative tensor of OEIs, keeps everything in core memory
 std::vector<py::array> oei_deriv_core(int deriv_order) {
     // Shell pairs after screening
-    const auto bs1_equiv_bs2 = true; // Only used for HF-type integrals
-    auto shellpairs = build_shellpairs();
+    auto shellpairs = build_shellpairs(bs1, bs2);
 
     // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
     // how many shell and operator derivatives for potential integrals
@@ -1449,12 +1520,12 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
         auto n1 = bs1[p1].size(); // number of basis functions in first shell
         auto n2 = bs2[p2].size(); // number of basis functions in first shell
         auto bf1 = shell2bf_1[p1];  // first basis function in first shell
-        auto bf2 = shell2bf_2[p2];  // first basis function in second shell    
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
         auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
         auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
         std::vector<long> shell_atom_index_list{atom1, atom2};
 
-        size_t thread_id = 0;
+        int thread_id = 0;
 #ifdef _OPENMP
         thread_id = omp_get_thread_num();
 #endif
@@ -1529,10 +1600,10 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
             // Loop over shell block for each buffer index which contributes to this derivative
             // Overlap and Kinetic
-            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                if (bs1_equiv_bs2 && p1 != p2) {
+            if (p1 != p2) {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
@@ -1541,7 +1612,11 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                             T[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += kinetic_shellset[idx];
                         }
                     }
-                } else {
+                }
+            } else {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
@@ -1551,16 +1626,19 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                 }
             }
             // Potential
-            for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                if (bs1_equiv_bs2 && p1 != p2) {
+            if (p1 != p2) {
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
                             V[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += potential_shellset[idx];
                         }
                     }
-                } else {
+                }
+            } else {
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
                             V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
@@ -1618,7 +1696,7 @@ py::array eri_deriv_core(int deriv_order) {
                     if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                     std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 
-                    size_t thread_id = 0;
+                    int thread_id = 0;
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif

From 0fc48f024f193c10559cc389b9a1fd0175768ccf Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Tue, 6 Feb 2024 14:54:36 -0500
Subject: [PATCH 42/91] WIP Schwarz Screening

---
 quax/integrals/libint_interface.cc | 209 ++++++++++++++++++++++++-----
 1 file changed, 179 insertions(+), 30 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 4890d62..44e7017 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -342,6 +342,76 @@ std::vector<std::pair<int, int>> build_shellpairs(libint2::BasisSet A, libint2::
     return threads_sp_list[0];
 }
 
+// Schwarz-Screening of two-electron integrals
+std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2::BasisSet B){
+
+    const auto A_equiv_B = (A == B);
+
+    // construct the 2-electron repulsion integrals engine
+    std::vector<libint2::Engine> engines(nthreads);
+    engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
+    engines[0].set_precision(0.);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    std::vector<double> shell_pair_values(A.size() * B.size());
+    double max_integral = 0.0;
+
+    // loop over permutationally-unique set of shells
+    #pragma omp parallel num_threads(nthreads)
+    {
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        auto &engine = engines[thread_id];
+
+        // loop over permutationally-unique set of shells
+        for (auto s1 = 0l, s12 = 0l; s1 != A.size(); ++s1) {
+            auto n1 = A[s1].size();
+
+            auto s2_max = A_equiv_B ? s1 : B.size() - 1;
+            for (auto s2 = 0; s2 <= s2_max; ++s2, ++s12) {
+                if (s12 % nthreads != thread_id) continue;
+
+                auto n2 = B[s2].size();
+
+                engines[thread_id].compute(A[s1], B[s2], A[s1], B[s2]);
+                const double *buffer = const_cast<double *>(engine.results()[0]);
+
+                double shell_max_val = 0.0;
+                for (int f1 = 0; f1 != n1; f1++) {
+                    for (int f2 = 0; f2 != n2; f2++) {
+                        shell_max_val =
+                            std::max(shell_max_val, std::fabs(buffer[f1 * (n1 * n2 * n2 + n2) + f2 * (n1 * n2 + 1)]));
+                    }
+                }
+                max_integral = std::max(max_integral, shell_max_val);
+                shell_pair_values[s1 * B.size() + s2] = shell_max_val;
+            }
+        }
+    }
+
+    double threshold_sq = threshold * threshold;
+    double threshold_sq_over_max = threshold_sq / max_integral;
+
+    std::vector<std::pair<int, int>> shell_pairs;
+
+    for (auto s1 = 0l, s12 = 0l; s1 != A.size(); ++s1) {
+        auto s2_max = A_equiv_B ? s1 : B.size() - 1;
+        for (auto s2 = 0; s2 <= s2_max; ++s2, ++s12) {
+            if (shell_pair_values[s1 * B.size() + s2] >= threshold_sq_over_max) {
+                shell_pairs.push_back(std::make_pair(s1, s2));
+            } else {
+                std::cout << "Removed: " << s1 << " " << s2 << std::endl;
+            }
+        }
+    }
+
+    return shell_pairs;
+}
+
 // Compute one-electron integral
 py::array compute_1e_int(std::string type) {
     // Shell pairs after screening
@@ -413,6 +483,13 @@ py::array compute_1e_int(std::string type) {
 
 // Computes two-electron integrals
 py::array compute_2e_int(std::string type, double beta) {
+    // Shell screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    const auto bs1_equiv_bs3 = (bs1 == bs3);
+    const auto bs3_equiv_bs4 = (bs3 == bs4);
+    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
+    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++?
     // avoids last line, which copies
     std::vector<libint2::Engine> eri_engines(nthreads);
@@ -447,41 +524,113 @@ py::array compute_2e_int(std::string type, double beta) {
     size_t length = nbf1 * nbf2 * nbf3 * nbf4;
     std::vector<double> result(length);
     
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3=0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];  // first basis function in first shell
-                    auto n1 = bs1[s1].size(); // number of basis functions in first shell
-                    auto bf2 = shell2bf_2[s2];  // first basis function in second shell
-                    auto n2 = bs2[s2].size(); // number of basis functions in second shell
-                    auto bf3 = shell2bf_3[s3];  // first basis function in third shell
-                    auto n3 = bs3[s3].size(); // number of basis functions in third shell
-                    auto bf4 = shell2bf_4[s4];  // first basis function in fourth shell
-                    auto n4 = bs4[s4].size(); // number of basis functions in fourth shell
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs_bra) {
+        int p1 = pair.first;
+        int p2 = pair.second;
 
-                    int thread_id = 0;
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+
+        for (const auto &pair : shellpairs_ket) {
+            int p3 = pair.first;
+            int p4 = pair.second;
+
+            const auto &s3 = bs1[p3];
+            const auto &s4 = bs2[p4];
+            auto n3 = bs3[p3].size(); // number of basis functions in first shell
+            auto n4 = bs4[p4].size(); // number of basis functions in first shell
+            auto bf3 = shell2bf_3[p3];  // first basis function in first shell
+            auto bf4 = shell2bf_4[p4];  // first basis function in second shell
+
+            int thread_id = 0;
 #ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
+            eri_engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
+            const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
 
-                    auto ints_shellset = buf_vec[0];    // Location of the computed integrals
-                    if (ints_shellset == nullptr)
-                        continue;  // nullptr returned if the entire shell-set was screened out
+            auto ints_shellset = buf_vec[0];    // Location of the computed integrals
+            if (ints_shellset == nullptr)
+                continue;  // nullptr returned if the entire shell-set was screened out
 
-                    // Loop over shell block, keeping a total count idx for the size of shell set
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                        for(auto f2 = 0; f2 != n2; ++f2) {
-                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                            for(auto f3 = 0; f3 != n3; ++f3) {
-                                size_t offset_3 = (bf3 + f3) * nbf4;
-                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] = ints_shellset[idx];
-                                }
+            std::cout << "(" << p1 << ", " << p2 << ", " << ", " << p3 << ", " << p4 << ")" << std::endl;
+
+            if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
+                // Loop over shell block, keeping a total count idx for the size of shell set
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                    size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                    for(auto f2 = 0; f2 != n2; ++f2) {
+                        size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                        size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                        for(auto f3 = 0; f3 != n3; ++f3) {
+                            size_t offset_3 = (bf3 + f3) * nbf4;
+                            size_t offset_3_T = bf3 + f3;
+                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                size_t offset_4 = bf4 + f4;
+                                size_t offset_4_T = (bf4 + f4) * nbf3;
+                                result[offset_1 + offset_2 + offset_3 + offset_4] = 
+                                    result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] = ints_shellset[idx];
+                                std::cout << "Loop (12|34) = (21|43) : " << ints_shellset[idx] << std::endl;
+                            }
+                        }
+                    }
+                }
+            } else if (bs1_equiv_bs2 && p1 != p2) {
+                // Loop over shell block, keeping a total count idx for the size of shell set
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                    size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                    for(auto f2 = 0; f2 != n2; ++f2) {
+                        size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                        size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                        for(auto f3 = 0; f3 != n3; ++f3) {
+                            size_t offset_3 = (bf3 + f3) * nbf4;
+                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                size_t offset_4 = bf4 + f4;
+                                result[offset_1 + offset_2 + offset_3 + offset_4] =
+                                    result[offset_1_T + offset_2_T + offset_3 + offset_4] = ints_shellset[idx];
+                                std::cout << "Loop (12|34) = (21|34) : " << ints_shellset[idx] << std::endl;
+                            }
+                        }
+                    }
+                }
+            } else if (bs3_equiv_bs4 && p3 != p4) {
+                // Loop over shell block, keeping a total count idx for the size of shell set
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                    for(auto f2 = 0; f2 != n2; ++f2) {
+                        size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                        for(auto f3 = 0; f3 != n3; ++f3) {
+                            size_t offset_3 = (bf3 + f3) * nbf4;
+                            size_t offset_3_T = bf3 + f3;
+                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                size_t offset_4 = bf4 + f4;
+                                size_t offset_4_T = (bf4 + f4) * nbf3;
+                                result[offset_1 + offset_2 + offset_3 + offset_4] =
+                                    result[offset_1 + offset_2 + offset_3_T + offset_4_T] = ints_shellset[idx];
+                                std::cout << "Loop (12|34) = (12|43) : " << ints_shellset[idx] << std::endl;
+                            }
+                        }
+                    }
+                }
+            } else {
+                // Loop over shell block, keeping a total count idx for the size of shell set
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                    for(auto f2 = 0; f2 != n2; ++f2) {
+                        size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                        for(auto f3 = 0; f3 != n3; ++f3) {
+                            size_t offset_3 = (bf3 + f3) * nbf4;
+                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                size_t offset_4 = bf4 + f4;
+                                result[offset_1 + offset_2 + offset_3 + offset_4] = ints_shellset[idx];
+                                std::cout << "Loop (12|34) : " << ints_shellset[idx] << std::endl;
                             }
                         }
                     }

From d55ab2ed0e60d40780c93352d68d19b01dcad793 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 7 Feb 2024 11:14:13 -0500
Subject: [PATCH 43/91] Working Schwarz screening

---
 .gitignore                         |  3 +++
 quax/integrals/libint_interface.cc | 24 +++++++++++++-----------
 setup.py                           |  2 +-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index cf2ea80..7713851 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,9 @@ __pycache__/
 *.h5
 *.xyz
 
+# Makefile
+makefile
+
 # Distribution / packaging
 .Python
 env/
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 44e7017..0013564 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -365,7 +365,6 @@ std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2:
 #ifdef _OPENMP
         thread_id = omp_get_thread_num();
 #endif
-        auto &engine = engines[thread_id];
 
         // loop over permutationally-unique set of shells
         for (auto s1 = 0l, s12 = 0l; s1 != A.size(); ++s1) {
@@ -378,7 +377,9 @@ std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2:
                 auto n2 = B[s2].size();
 
                 engines[thread_id].compute(A[s1], B[s2], A[s1], B[s2]);
-                const double *buffer = const_cast<double *>(engine.results()[0]);
+                const double * buffer = const_cast<double *>(engines[thread_id].results()[0]);
+
+                if (buffer == nullptr) continue;
 
                 double shell_max_val = 0.0;
                 for (int f1 = 0; f1 != n1; f1++) {
@@ -558,8 +559,7 @@ py::array compute_2e_int(std::string type, double beta) {
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
 
-            std::cout << "(" << p1 << ", " << p2 << ", " << ", " << p3 << ", " << p4 << ")" << std::endl;
-
+            auto full = false;
             if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -576,12 +576,13 @@ py::array compute_2e_int(std::string type, double beta) {
                                 size_t offset_4_T = (bf4 + f4) * nbf3;
                                 result[offset_1 + offset_2 + offset_3 + offset_4] = 
                                     result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] = ints_shellset[idx];
-                                std::cout << "Loop (12|34) = (21|43) : " << ints_shellset[idx] << std::endl;
                             }
                         }
                     }
                 }
-            } else if (bs1_equiv_bs2 && p1 != p2) {
+                full = true;
+            } 
+            if (bs1_equiv_bs2 && p1 != p2) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -595,12 +596,13 @@ py::array compute_2e_int(std::string type, double beta) {
                                 size_t offset_4 = bf4 + f4;
                                 result[offset_1 + offset_2 + offset_3 + offset_4] =
                                     result[offset_1_T + offset_2_T + offset_3 + offset_4] = ints_shellset[idx];
-                                std::cout << "Loop (12|34) = (21|34) : " << ints_shellset[idx] << std::endl;
                             }
                         }
                     }
                 }
-            } else if (bs3_equiv_bs4 && p3 != p4) {
+                full = true;
+            } 
+            if (bs3_equiv_bs4 && p3 != p4) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -614,12 +616,13 @@ py::array compute_2e_int(std::string type, double beta) {
                                 size_t offset_4_T = (bf4 + f4) * nbf3;
                                 result[offset_1 + offset_2 + offset_3 + offset_4] =
                                     result[offset_1 + offset_2 + offset_3_T + offset_4_T] = ints_shellset[idx];
-                                std::cout << "Loop (12|34) = (12|43) : " << ints_shellset[idx] << std::endl;
                             }
                         }
                     }
                 }
-            } else {
+                full = true;
+            } 
+            if (full == false) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -630,7 +633,6 @@ py::array compute_2e_int(std::string type, double beta) {
                             for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                 size_t offset_4 = bf4 + f4;
                                 result[offset_1 + offset_2 + offset_3 + offset_4] = ints_shellset[idx];
-                                std::cout << "Loop (12|34) : " << ints_shellset[idx] << std::endl;
                             }
                         }
                     }
diff --git a/setup.py b/setup.py
index 6f0e3b1..5c61f00 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
             'numpy>=1.23',
             'jax>=0.4.19',
             'jaxlib>=0.4.19',
-            'h5py>=2.8.0'
+            'h5py>=2.8.0',
             'scipy>=1.9'
         ],
         extras_require={

From fe0e2a1805904f9043d7b874fe7a30eb7a2ba83f Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 8 Feb 2024 12:22:05 -0500
Subject: [PATCH 44/91] Working 2e Screening?

---
 .gitignore                         |   2 +-
 quax/integrals/libint_interface.cc | 117 ++++++++++++++++-------------
 quax/integrals/makefile            |  43 -----------
 3 files changed, 65 insertions(+), 97 deletions(-)
 delete mode 100644 quax/integrals/makefile

diff --git a/.gitignore b/.gitignore
index 7713851..15f3eec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,7 @@ __pycache__/
 *.xyz
 
 # Makefile
-makefile
+**/makefile
 
 # Distribution / packaging
 .Python
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 0013564..f668a5d 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -30,6 +30,7 @@ size_t max_nprim;
 int max_l;
 int nthreads = 1;
 double threshold;
+double max_engine_precision;
 
 // Creates atom objects from xyz file path
 std::vector<libint2::Atom> get_atoms(std::string xyzfilename) 
@@ -147,6 +148,7 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
                          std::max(bs3.max_nprim(), bs4.max_nprim()));
     max_l = std::max(std::max(bs1.max_l(), bs2.max_l()),
                      std::max(bs3.max_l(), bs4.max_l()));
+    max_engine_precision = std::log(std::numeric_limits<double>::epsilon() * threshold);
 
     // Get number of OMP threads
 #ifdef _OPENMP
@@ -320,15 +322,12 @@ std::vector<std::pair<int, int>> build_shellpairs(libint2::BasisSet A, libint2::
                 if (!on_same_center) {
                     auto n2 = B[s2].size();
                     engines[thread_id].compute(A[s1], B[s2]);
-                    double normsq = std::inner_product(buf[0], buf[0] + n1 * n2, buf[0], 0.0);
+                    double normsq = std::inner_product(buf[0], buf[0] + n1 * n2, buf[0], 0.0); // Frobenius Norm
                     significant = (normsq >= threshold_sq);
                 }
 
-                if (significant) {
+                if (significant)
                     threads_sp_list[thread_id].push_back(std::make_pair(s1, s2));
-                } else {
-                    std::cout << "Removed Set: " << s1 << " " << s2 << std::endl;
-                }
             }
         }
     }  // end of compute
@@ -402,11 +401,8 @@ std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2:
     for (auto s1 = 0l, s12 = 0l; s1 != A.size(); ++s1) {
         auto s2_max = A_equiv_B ? s1 : B.size() - 1;
         for (auto s2 = 0; s2 <= s2_max; ++s2, ++s12) {
-            if (shell_pair_values[s1 * B.size() + s2] >= threshold_sq_over_max) {
+            if (shell_pair_values[s1 * B.size() + s2] >= threshold_sq_over_max)
                 shell_pairs.push_back(std::make_pair(s1, s2));
-            } else {
-                std::cout << "Removed: " << s1 << " " << s2 << std::endl;
-            }
         }
     }
 
@@ -433,6 +429,7 @@ py::array compute_1e_int(std::string type) {
        throw std::invalid_argument("type must be overlap, kinetic, or potential");
     }
 
+    engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
         engines[i] = engines[0];
     }
@@ -486,40 +483,40 @@ py::array compute_1e_int(std::string type) {
 py::array compute_2e_int(std::string type, double beta) {
     // Shell screening
     const auto bs1_equiv_bs2 = (bs1 == bs2);
-    const auto bs1_equiv_bs3 = (bs1 == bs3);
     const auto bs3_equiv_bs4 = (bs3 == bs4);
     const auto shellpairs_bra = schwarz_screening(bs1, bs2);
     const auto shellpairs_ket = schwarz_screening(bs3, bs4);
 
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++?
     // avoids last line, which copies
-    std::vector<libint2::Engine> eri_engines(nthreads);
+    std::vector<libint2::Engine> engines(nthreads);
 
     if (type == "eri") {
-        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
+        engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l);
     } else if (type == "f12") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12g12") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12_squared") {
         auto cgtg_params = take_square(make_cgtg(beta));
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12_double_commutator") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0,
+        engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, 0,
                                             std::numeric_limits<libint2::scalar_type>::epsilon(),
                                             cgtg_params, libint2::BraKet::xx_xx);
     } else {
         throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
     }
 
+    engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
+        engines[i] = engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4;
@@ -541,8 +538,8 @@ py::array compute_2e_int(std::string type, double beta) {
             int p3 = pair.first;
             int p4 = pair.second;
 
-            const auto &s3 = bs1[p3];
-            const auto &s4 = bs2[p4];
+            const auto &s3 = bs3[p3];
+            const auto &s4 = bs4[p4];
             auto n3 = bs3[p3].size(); // number of basis functions in first shell
             auto n4 = bs4[p4].size(); // number of basis functions in first shell
             auto bf3 = shell2bf_3[p3];  // first basis function in first shell
@@ -552,8 +549,8 @@ py::array compute_2e_int(std::string type, double beta) {
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
 #endif
-            eri_engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
-            const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
+            engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
             auto ints_shellset = buf_vec[0];    // Location of the computed integrals
             if (ints_shellset == nullptr)
@@ -680,6 +677,7 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
        throw std::invalid_argument("type must be overlap, kinetic, or potential");
     }
 
+    engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
         engines[i] = engines[0];
     }
@@ -800,33 +798,34 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // ERI derivative integral engine
-    std::vector<libint2::Engine> eri_engines(nthreads);
+    std::vector<libint2::Engine> engines(nthreads);
 
     if (type == "eri") {
-        eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+        engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
     } else if (type == "f12") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12g12") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12_squared") {
         auto cgtg_params = take_square(make_cgtg(beta));
-        eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-        eri_engines[0].set_params(cgtg_params);
+        engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+        engines[0].set_params(cgtg_params);
     } else if (type == "f12_double_commutator") {
         auto cgtg_params = make_cgtg(beta);
-        eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
+        engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
                                             std::numeric_limits<libint2::scalar_type>::epsilon(),
                                             cgtg_params, libint2::BraKet::xx_xx);
     } else {
         throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
     }
 
+    engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
+        engines[i] = engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4;
@@ -906,8 +905,8 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = eri_engines[thread_id].results(); // will point to computed shell sets
+                    engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -997,6 +996,7 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
            throw std::invalid_argument("type must be overlap, kinetic, or potential");
         }
 
+        engines[0].set_precision(max_engine_precision);
         for (size_t i = 1; i != nthreads; ++i) {
             engines[i] = engines[0];
         }
@@ -1176,33 +1176,34 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
         const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
         // Libint engine for computing shell quartet derivatives
-        std::vector<libint2::Engine> eri_engines(nthreads);
+        std::vector<libint2::Engine> engines(nthreads);
 
         if (type == "eri") {
-            eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+            engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
         } else if (type == "f12") {
             auto cgtg_params = make_cgtg(beta);
-            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-            eri_engines[0].set_params(cgtg_params);
+            engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+            engines[0].set_params(cgtg_params);
         } else if (type == "f12g12") {
             auto cgtg_params = make_cgtg(beta);
-            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
-            eri_engines[0].set_params(cgtg_params);
+            engines[0] = libint2::Engine(libint2::Operator::cgtg_x_coulomb, max_nprim, max_l, deriv_order);
+            engines[0].set_params(cgtg_params);
         } else if (type == "f12_squared") {
             auto cgtg_params = take_square(make_cgtg(beta));
-            eri_engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
-            eri_engines[0].set_params(cgtg_params);
+            engines[0] = libint2::Engine(libint2::Operator::cgtg, max_nprim, max_l, deriv_order);
+            engines[0].set_params(cgtg_params);
         } else if (type == "f12_double_commutator") {
             auto cgtg_params = make_cgtg(beta);
-            eri_engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
+            engines[0] = libint2::Engine(libint2::Operator::delcgtg2, max_nprim, max_l, deriv_order,
                                                 std::numeric_limits<libint2::scalar_type>::epsilon(),
                                                 cgtg_params, libint2::BraKet::xx_xx);
         } else {
             throw std::invalid_argument("type must be eri, f12, f12g12, f12_squared, or f12_double_commutator");
         }
 
+        engines[0].set_precision(max_engine_precision);
         for (size_t i = 1; i != nthreads; ++i) {
-            eri_engines[i] = eri_engines[0];
+            engines[i] = engines[0];
         }
 
         // Define HDF5 dataset name
@@ -1245,8 +1246,8 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
 #ifdef _OPENMP
                         thread_id = omp_get_thread_num();
 #endif
-                        eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+                        engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                        const auto& eri_buffer = engines[thread_id].results(); // will point to computed shell sets
 
                         // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
                         double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
@@ -1400,6 +1401,10 @@ void oei_deriv_disk(int max_deriv_order) {
         t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
         v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
         v_engines[0].set_params(make_point_charges(atoms));
+
+        s_engines[0].set_precision(max_engine_precision);
+        t_engines[0].set_precision(max_engine_precision);
+        v_engines[0].set_precision(max_engine_precision);
         for (size_t i = 1; i != nthreads; ++i) {
             s_engines[i] = s_engines[0];
             t_engines[i] = t_engines[0];
@@ -1650,6 +1655,10 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
     t_engines[0] = libint2::Engine(libint2::Operator::kinetic, max_nprim, max_l, deriv_order);
     v_engines[0] = libint2::Engine(libint2::Operator::nuclear, max_nprim, max_l, deriv_order);
     v_engines[0].set_params(make_point_charges(atoms));
+
+    s_engines[0].set_precision(max_engine_precision);
+    t_engines[0].set_precision(max_engine_precision);
+    v_engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
         s_engines[i] = s_engines[0];
         t_engines[i] = t_engines[0];
@@ -1816,10 +1825,12 @@ py::array eri_deriv_core(int deriv_order) {
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
     // Libint engine for computing shell quartet derivatives
-    std::vector<libint2::Engine> eri_engines(nthreads);
-    eri_engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+    std::vector<libint2::Engine> engines(nthreads);
+    engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
+
+    engines[0].set_precision(max_engine_precision);
     for (size_t i = 1; i != nthreads; ++i) {
-        eri_engines[i] = eri_engines[0];
+        engines[i] = engines[0];
     }
 
     size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
@@ -1851,8 +1862,8 @@ py::array eri_deriv_core(int deriv_order) {
 #ifdef _OPENMP
                     thread_id = omp_get_thread_num();
 #endif
-                    eri_engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& eri_buffer = eri_engines[thread_id].results(); // will point to computed shell sets
+                    engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
+                    const auto& eri_buffer = engines[thread_id].results(); // will point to computed shell sets
 
                     // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                     for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
deleted file mode 100644
index d7c5a88..0000000
--- a/quax/integrals/makefile
+++ /dev/null
@@ -1,43 +0,0 @@
-# NOTE: These paths below need to be edited such that they point to a set of 
-# Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
-CC      := g++
-# Options passed to compiler, add "-fopenmp" if intending to use OpenMP
-CFLAGS  := -O3 -fPIC -fopenmp -g
-# Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/vulcan/ecm23353/.conda/envs/f12
-# Conda prefix location, it is suggested to use conda to install nearly all dependencies
-CONDA_PREFIX := /home/vulcan/ecm23353/.conda/envs/f12
-
-I1 := $(LIBINT_PREFIX)/include
-I2 := $(LIBINT_PREFIX)/include/libint2
-L1 := $(LIBINT_PREFIX)/lib
-# Eigen headers location 
-I3 := $(CONDA_PREFIX)/include/eigen3
-# Python headers location 
-I4 := $(CONDA_PREFIX)/include/python3.10
-# Pybind11 headers location 
-I5 := $(CONDA_PREFIX)/lib/python3.10/site-packages/pybind11/include
-# HDF5 headers, static and shared libraries 
-I6 := $(CONDA_PREFIX)/include
-L2 := $(CONDA_PREFIX)/lib
-# Edit path in quotes to be same location as L2 definition above
-RPATH := -Wl,-rpath,"$(CONDA_PREFIX)/lib"
-
-# This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
-# and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
-TARGETS := libint_interface.cpython-310-x86_64-linux-gnu.so
-OBJ     := libint_interface.o
-
-# Rest is boilerplate. Do not edit unless you know what you're doing.
-.PHONY: all clean
-
-all: $(TARGETS)
-
-clean:
-	rm -f $(OBJ)
-
-$(OBJ): %.o : %.cc $(DEPS)
-	$(CC) -c $< -o $@ $(CFLAGS) -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
-$(TARGETS): $(OBJ)
-	$(CC) $^ -o $@ $(CFLAGS) -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
-

From 4cd013ab515889b5f04ee89137faab8f964e1b96 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 8 Feb 2024 12:23:33 -0500
Subject: [PATCH 45/91] Makefile

---
 .gitignore              |  3 ---
 quax/integrals/makefile | 43 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 quax/integrals/makefile

diff --git a/.gitignore b/.gitignore
index 15f3eec..cf2ea80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,9 +13,6 @@ __pycache__/
 *.h5
 *.xyz
 
-# Makefile
-**/makefile
-
 # Distribution / packaging
 .Python
 env/
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
new file mode 100644
index 0000000..de1e519
--- /dev/null
+++ b/quax/integrals/makefile
@@ -0,0 +1,43 @@
+# NOTE: The paths below need to be edited so that they point to a set of
+# Eigen headers, Python headers, Pybind11 headers, the Libint API headers (libint2.h, libint2.hpp), the rest of the Libint2 headers, and the library location of libint2.a.
+CC      := g++-10
+# Options passed to compiler, add "-fopenmp" if intending to use OpenMP
+CFLAGS  := -O3 -fPIC -fopenmp -g
+# Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
+LIBINT_PREFIX := /home/ecm23353/miniconda3/envs/p4dev
+# Conda prefix location, it is suggested to use conda to install nearly all dependencies
+CONDA_PREFIX := /home/ecm23353/miniconda3/envs/p4dev
+
+I1 := $(LIBINT_PREFIX)/include
+I2 := $(LIBINT_PREFIX)/include/libint2
+L1 := $(LIBINT_PREFIX)/lib
+# Eigen headers location 
+I3 := $(CONDA_PREFIX)/include/eigen3
+# Python headers location 
+I4 := $(CONDA_PREFIX)/include/python3.11
+# Pybind11 headers location 
+I5 := $(CONDA_PREFIX)/lib/python3.11/site-packages/pybind11/include
+# HDF5 headers, static and shared libraries 
+I6 := $(CONDA_PREFIX)/include
+L2 := $(CONDA_PREFIX)/lib
+# Edit path in quotes to be same location as L2 definition above
+RPATH := -Wl,-rpath,"$(CONDA_PREFIX)/lib"
+
+# The 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on the command line,
+# and it should match the Python version referenced in the include path for I4 above (3.11 in this case).
+TARGETS := libint_interface.cpython-311-x86_64-linux-gnu.so
+OBJ     := libint_interface.o
+
+# Rest is boilerplate. Do not edit unless you know what you're doing.
+.PHONY: all clean
+
+all: $(TARGETS)
+
+clean:
+	rm -f $(OBJ)
+
+$(OBJ): %.o : %.cc $(DEPS)
+	$(CC) -c $< -o $@ $(CFLAGS) -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+$(TARGETS): $(OBJ)
+	$(CC) $^ -o $@ $(CFLAGS) -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+

From b7fab0f2b415b1cfca1491766754f503c2af42c0 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 8 Feb 2024 14:07:22 -0500
Subject: [PATCH 46/91] tei_deriv working, not disk

---
 quax/integrals/libint_interface.cc | 500 +++++++++++++++++++----------
 quax/integrals/oei.py              |   3 -
 2 files changed, 336 insertions(+), 167 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index f668a5d..263c9bd 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -797,6 +797,12 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
     // to multidimensional shell derivative index
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
+    // Shell screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    const auto bs3_equiv_bs4 = (bs3 == bs4);
+    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
+    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+
     // ERI derivative integral engine
     std::vector<libint2::Engine> engines(nthreads);
 
@@ -831,95 +837,177 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
     size_t length = nbf1 * nbf2 * nbf3 * nbf4;
     std::vector<double> result(length);
 
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    // If the atoms are the same we ignore it as the derivatives will be zero.
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    // Ensure all desired_atoms correspond to at least one shell atom to
-                    // ensure desired derivative exists. else, skip this shell quartet.
-                    bool atoms_not_present = false;
-                    for (int i = 0; i < deriv_order; i++){
-                        if (atom1 == desired_atom_indices[i]) continue; 
-                        else if (atom2 == desired_atom_indices[i]) continue;
-                        else if (atom3 == desired_atom_indices[i]) continue;
-                        else if (atom4 == desired_atom_indices[i]) continue;
-                        else {atoms_not_present = true; break;}
-                    }
-                    if (atoms_not_present) continue;
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs_bra) {
+        int p1 = pair.first;
+        int p2 = pair.second;
 
-                    // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-                
-                    // For every desired atom derivative, check shell indices for a match,
-                    // add it to subvector for that derivative
-                    // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                    for (int j = 0; j < desired_atom_indices.size(); j++){
-                        int desired_atom_idx = desired_atom_indices[j];
-                        // Shell indices
-                        for (int i = 0; i < 4; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coordinates[j];
-                                indices[j].push_back(tmp);
-                            }
-                        }
-                    }
-                    
-                    // Now indices is a vector of vectors, where each subvector is your choices
-                    // for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product 
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
 
-                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        // buffer_multidim_lookup
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
+        for (const auto &pair : shellpairs_ket) {
+            int p3 = pair.first;
+            int p4 = pair.second;
+
+            const auto &s3 = bs3[p3];
+            const auto &s4 = bs4[p4];
+            auto n3 = bs3[p3].size(); // number of basis functions in third shell
+            auto n4 = bs4[p4].size(); // number of basis functions in fourth shell
+            auto bf3 = shell2bf_3[p3];  // first basis function in third shell
+            auto bf4 = shell2bf_4[p4];  // first basis function in fourth shell
+            auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
+            auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
+
+            // If all four shells are centered on the same atom, skip this quartet since its derivatives will be zero.
+            if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+            // Ensure all desired_atoms correspond to at least one shell atom to
+            // ensure desired derivative exists. else, skip this shell quartet.
+            bool atoms_not_present = false;
+            for (int i = 0; i < deriv_order; i++){
+                if (atom1 == desired_atom_indices[i]) continue; 
+                else if (atom2 == desired_atom_indices[i]) continue;
+                else if (atom3 == desired_atom_indices[i]) continue;
+                else if (atom4 == desired_atom_indices[i]) continue;
+                else {atoms_not_present = true; break;}
+            }
+            if (atoms_not_present) continue;
+
+            // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+            std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+            
+            // For every desired atom derivative, check shell indices for a match,
+            // add it to subvector for that derivative
+            // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+            for (int j = 0; j < desired_atom_indices.size(); j++){
+                int desired_atom_idx = desired_atom_indices[j];
+                // Shell indices
+                for (int i = 0; i < 4; i++){
+                    int atom_idx = shell_atom_index_list[i];
+                    if (atom_idx == desired_atom_idx) {
+                        int tmp = 3 * i + desired_coordinates[j];
+                        indices[j].push_back(tmp);
                     }
+                }
+            }
+            
+            // Now indices is a vector of vectors, where each subvector is your choices
+            // for the first derivative operator, second, third, etc
+            // and the total number of subvectors is the order of differentiation
+            // Now we want all combinations where we pick exactly one index from each subvector.
+            // This is achievable through a cartesian product 
+            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+            std::vector<int> buffer_indices;
+
+            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+            for (auto vec : index_combos)  {
+                std::sort(vec.begin(), vec.end());
+                int buf_idx = 0;
+                // buffer_multidim_lookup
+                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                buffer_indices.push_back(buf_idx);
+            }
 
-                    // If we made it this far, the shell derivative we want is contained in the buffer. 
-                    int thread_id = 0;
+            // If we made it this far, the shell derivative we want is contained in the buffer. 
+            int thread_id = 0;
 #ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                    engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
-
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto ints_shellset = buf_vec[buffer_indices[i]];
-                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                            for(auto f2 = 0; f2 != n2; ++f2) {
-                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                for(auto f3 = 0; f3 != n3; ++f3) {
-                                    size_t offset_3 = (bf3 + f3) * nbf4;
-                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
-                                    }
+            engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+            
+            auto full = false;
+            if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto ints_shellset = buf_vec[buffer_indices[i]];
+                    if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                size_t offset_3_T = bf3 + f3;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    size_t offset_4 = bf4 + f4;
+                                    size_t offset_4_T = (bf4 + f4) * nbf3;
+                                    result[offset_1 + offset_2 + offset_3 + offset_4] = 
+                                        result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] += ints_shellset[idx];
+                                }
+                            }
+                        }
+                    }
+                }
+                full = true;
+            }
+            if (bs1_equiv_bs2 && p1 != p2) {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto ints_shellset = buf_vec[buffer_indices[i]];
+                    if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    size_t offset_4 = bf4 + f4;
+                                    result[offset_1 + offset_2 + offset_3 + offset_4] =
+                                        result[offset_1_T + offset_2_T + offset_3 + offset_4] += ints_shellset[idx];
+                                }
+                            }
+                        }
+                    }
+                }
+                full = true;
+            }
+            if (bs3_equiv_bs4 && p3 != p4) {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto ints_shellset = buf_vec[buffer_indices[i]];
+                    if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                size_t offset_3_T = bf3 + f3;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    size_t offset_4 = bf4 + f4;
+                                    size_t offset_4_T = (bf4 + f4) * nbf3;
+                                    result[offset_1 + offset_2 + offset_3 + offset_4] =
+                                        result[offset_1 + offset_2 + offset_3_T + offset_4_T] += ints_shellset[idx];
+                                }
+                            }
+                        }
+                    }
+                }
+                full = true;
+            }
+            if (full == false) {
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto ints_shellset = buf_vec[buffer_indices[i]];
+                    if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                        for(auto f2 = 0; f2 != n2; ++f2) {
+                            size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                            for(auto f3 = 0; f3 != n3; ++f3) {
+                                size_t offset_3 = (bf3 + f3) * nbf4;
+                                for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                    result[offset_1 + offset_2 + offset_3 + bf4 + f4] += ints_shellset[idx];
                                 }
                             }
                         }
@@ -1207,13 +1295,13 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
         }
 
         // Define HDF5 dataset name
-        const H5std_string eri_dset_name(type + "_" + std::to_string(nbf1) + "_" + std::to_string(nbf2)
+        const H5std_string dset_name(type + "_" + std::to_string(nbf1) + "_" + std::to_string(nbf2)
                                          + "_" + std::to_string(nbf3) + "_" + std::to_string(nbf4)
                                          + "_deriv" + std::to_string(deriv_order));
         hsize_t file_dims[] = {nbf1, nbf2, nbf3, nbf4, nderivs_triu};
         DataSpace fspace(5, file_dims);
         // Create dataset for each integral type and write 0.0's into the file 
-        DataSet* eri_dataset = new DataSet(file->createDataSet(eri_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        DataSet* dataset = new DataSet(file->createDataSet(dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         hsize_t stride[5] = {1, 1, 1, 1, 1}; // stride and block can be used to 
         hsize_t block[5] = {1, 1, 1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[5] = {0, 0, 0, 0, 0};
@@ -1247,10 +1335,10 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                         thread_id = omp_get_thread_num();
 #endif
                         engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& eri_buffer = engines[thread_id].results(); // will point to computed shell sets
+                        const auto& buffer = engines[thread_id].results(); // will point to computed shell sets
 
                         // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double eri_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
+                        double ints_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
                         // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
                         for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
                             // Look up multidimensional cartesian derivative index
@@ -1291,13 +1379,13 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
 
                             // Loop over shell block, keeping a total count idx for the size of shell set
                             for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto eri_shellset = eri_buffer[buffer_indices[i]];
-                                if (eri_shellset == nullptr) continue;
+                                auto ints_shellset = buffer[buffer_indices[i]];
+                                if (ints_shellset == nullptr) continue;
                                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                                     for(auto f2 = 0; f2 != n2; ++f2) {
                                         for(auto f3 = 0; f3 != n3; ++f3) {
                                             for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                eri_shellset_slab[f1][f2][f3][f4][nuc_idx] += eri_shellset[idx];
+                                                ints_shellset_slab[f1][f2][f3][f4][nuc_idx] += ints_shellset[idx];
                                             }
                                         }
                                     }
@@ -1318,7 +1406,7 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                         mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
                         // Write buffer data 'shellset_slab' with data type double from
                         // memory dataspace `mspace` to file dataspace `fspace`
-                        eri_dataset->write(eri_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+                        dataset->write(ints_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
 
                         /* Release lock */
                         omp_unset_lock(&lock);
@@ -1327,7 +1415,7 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
             }
         } // shell quartet loops
         // Close the dataset for this derivative order
-        delete eri_dataset;
+        delete dataset;
     } // deriv order loop
 
     /* Finished lock mechanism, destroy it */
@@ -1824,6 +1912,10 @@ py::array eri_deriv_core(int deriv_order) {
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
+    // Shell screening
+    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
+    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+
     // Libint engine for computing shell quartet derivatives
     std::vector<libint2::Engine> engines(nthreads);
     engines[0] = libint2::Engine(libint2::Operator::coulomb, max_nprim, max_l, deriv_order);
@@ -1836,93 +1928,173 @@ py::array eri_deriv_core(int deriv_order) {
     size_t length = nbf1 * nbf2 * nbf3 * nbf4 * nderivs_triu;
     std::vector<double> result(length);
 
-    // Begin shell quartet loops
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-    for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-        for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-            for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                    auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                    auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                    auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                    auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                    auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                    auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                    auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                    auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                    auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                    auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                    auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                    auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                    if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                    std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                    int thread_id = 0;
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs_bra) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in first shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+
+        for (const auto &pair : shellpairs_ket) {
+            int p3 = pair.first;
+            int p4 = pair.second;
+
+            const auto &s3 = bs3[p3];
+            const auto &s4 = bs4[p4];
+            auto n3 = bs3[p3].size(); // number of basis functions in first shell
+            auto n4 = bs4[p4].size(); // number of basis functions in first shell
+            auto bf3 = shell2bf_3[p3];  // first basis function in first shell
+            auto bf4 = shell2bf_4[p4];  // first basis function in second shell
+            auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
+            auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
+
+            if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+            std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+            int thread_id = 0;
 #ifdef _OPENMP
-                    thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                    engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                    const auto& eri_buffer = engines[thread_id].results(); // will point to computed shell sets
+            engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
 
-                    // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                    for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                        size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                size_t offset_nuc_idx = nuc_idx * nbf1 * nbf2 * nbf3 * nbf4;
 
-                        // Look up multidimensional cartesian derivative index
-                        auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
     
-                        // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
-                        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-                        for (int j = 0; j < multi_cart_idx.size(); j++){
-                            int desired_atom_idx = multi_cart_idx[j] / 3;
-                            int desired_coord = multi_cart_idx[j] % 3;
-                            for (int i = 0; i < 4; i++){
-                                int atom_idx = shell_atom_index_list[i];
-                                if (atom_idx == desired_atom_idx) {
-                                    int tmp = 3 * i + desired_coord;
-                                    indices[j].push_back(tmp);
+                // Find out which shell derivatives provided by Libint correspond to this nuclear cartesian derivative
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    for (int i = 0; i < 4; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product 
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                
+                // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    // buffer_multidim_lookup
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
+                }
+
+                auto full = false;
+                if (p1 != p2 && p3 != p4) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    size_t offset_3_T = bf3 + f3;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        size_t offset_4 = bf4 + f4;
+                                        size_t offset_4_T = (bf4 + f4) * nbf3;
+                                        result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] = 
+                                            result[offset_1_T + offset_2_T + offset_3_T + offset_4_T  + offset_nuc_idx] += ints_shellset[idx];
+                                    }
                                 }
                             }
                         }
-
-                        // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
-                        // and the total number of subvectors is the order of differentiation
-                        // Now we want all combinations where we pick exactly one index from each subvector.
-                        // This is achievable through a cartesian product 
-                        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                        std::vector<int> buffer_indices;
-                        
-                        // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                        for (auto vec : index_combos)  {
-                            std::sort(vec.begin(), vec.end());
-                            int buf_idx = 0;
-                            // buffer_multidim_lookup
-                            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                            buffer_indices.push_back(buf_idx);
+                    }
+                    full = true;
+                }
+                if (p1 != p2) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            size_t offset_1_T = (bf1 + f1) * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                size_t offset_2_T = (bf2 + f2) * nbf1 * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        size_t offset_4 = bf4 + f4;
+                                        result[offset_1 + offset_2 + offset_3 + offset_4  + offset_nuc_idx] =
+                                            result[offset_1_T + offset_2_T + offset_3 + offset_4  + offset_nuc_idx] += ints_shellset[idx];
+                                    }
+                                }
+                            }
                         }
-
+                    }
+                    full = true;
+                }
+                if (p3 != p4) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
                         // Loop over shell block, keeping a total count idx for the size of shell set
-                        for(auto i = 0; i < buffer_indices.size(); ++i) {
-                            auto eri_shellset = eri_buffer[buffer_indices[i]];
-                            if (eri_shellset == nullptr) continue;
-                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
-                                for(auto f2 = 0; f2 != n2; ++f2) {
-                                    size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
-                                    for(auto f3 = 0; f3 != n3; ++f3) {
-                                        size_t offset_3 = (bf3 + f3) * nbf4;
-                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                            result[offset_1 + offset_2 + offset_3 + bf4 + f4 + offset_nuc_idx] += eri_shellset[idx];
-                                        }
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    size_t offset_3_T = bf3 + f3;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        size_t offset_4 = bf4 + f4;
+                                        size_t offset_4_T = (bf4 + f4) * nbf3;
+                                        result[offset_1 + offset_2 + offset_3 + offset_4  + offset_nuc_idx] =
+                                            result[offset_1 + offset_2 + offset_3_T + offset_4_T  + offset_nuc_idx] += ints_shellset[idx];
                                     }
                                 }
                             }
                         }
-                    } // For every nuc_idx 0, nderivs_triu
+                    }
+                    full = true;
                 }
-            }
+                if (full == false) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto ints_shellset = buf_vec[buffer_indices[i]];
+                        if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
+                            for(auto f2 = 0; f2 != n2; ++f2) {
+                                size_t offset_2 = (bf2 + f2) * nbf3 * nbf4;
+                                for(auto f3 = 0; f3 != n3; ++f3) {
+                                    size_t offset_3 = (bf3 + f3) * nbf4;
+                                    for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                        result[offset_1 + offset_2 + offset_3 + bf4 + f4  + offset_nuc_idx] += ints_shellset[idx];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            } // For every nuc_idx 0, nderivs_triu
         }
     } // shell quartet loops
     return py::array(result.size(), result.data()); // This apparently copies data, but it should be fine right? https://github.com/pybind/pybind11/issues/1042 there's a workaround
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 074a417..608f275 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -111,7 +111,6 @@ def overlap_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             S = self.overlap_derivatives[deriv_order-1][idx,:,:]
-            jax.debug.print(" {b} ", b=jnp.allclose(S, S.T))
             return jnp.asarray(S)
         if self.mode == 'f12':
             S = libint_interface.compute_1e_deriv("overlap", deriv_vec)
@@ -144,7 +143,6 @@ def kinetic_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             T = self.kinetic_derivatives[deriv_order-1][idx,:,:]
-            jax.debug.print(" {b} ", b=jnp.allclose(T, T.T))
             return jnp.asarray(T)
         if self.mode == 'f12':
             T = libint_interface.compute_1e_deriv("kinetic", deriv_vec)
@@ -177,7 +175,6 @@ def potential_deriv_impl(self, geom, deriv_vec):
 
         if self.mode == 'core':
             V = self.potential_derivatives[deriv_order-1][idx,:,:]
-            jax.debug.print(" {b} ", b=jnp.allclose(V, V.T))
             return jnp.asarray(V)
         if self.mode == 'f12':
             V = libint_interface.compute_1e_deriv("potential", deriv_vec)

From 0c6820dea3540ff1aa71bdb46675100be2b3d86e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 9 Feb 2024 13:14:57 -0500
Subject: [PATCH 47/91] OEI Disk mem no copy

---
 quax/integrals/libint_interface.cc | 73 +++++++++---------------------
 1 file changed, 22 insertions(+), 51 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 263c9bd..b512f81 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -1514,8 +1514,6 @@ void oei_deriv_disk(int max_deriv_order) {
         DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
-        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
-        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[3] = {0, 0, 0};
 
         /* Initialize lock */
@@ -1552,10 +1550,6 @@ void oei_deriv_disk(int max_deriv_order) {
             double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
             double potential_shellset_slab [n1][n2][nderivs_triu] = {};
 
-            double overlap_shellset_slab_T [n2][n1][nderivs_triu] = {};
-            double kinetic_shellset_slab_T [n2][n1][nderivs_triu] = {};
-            double potential_shellset_slab_T [n2][n1][nderivs_triu] = {};
-
             // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
             // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
             for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
@@ -1618,49 +1612,22 @@ void oei_deriv_disk(int max_deriv_order) {
 
                 // Loop over shell block for each buffer index which contributes to this derivative
                 // Overlap and Kinetic
-                if (p1 != p2) {
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
-                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
-                                overlap_shellset_slab_T[f2][f1][nuc_idx] += overlap_shellset[idx];
-                                kinetic_shellset_slab_T[f2][f1][nuc_idx] += kinetic_shellset[idx];
-                            }
-                        }
-                    }
-                } else {
-                    for(auto i = 0; i < buffer_indices.size(); ++i) {
-                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
-                                kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
-                            }
+                for(auto i = 0; i < buffer_indices.size(); ++i) {
+                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
+                            kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
                         }
                     }
                 }
                 // Potential
-                if (p1 != p2) {
-                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
-                                potential_shellset_slab_T[f2][f1][nuc_idx] += potential_shellset[idx];
-                            }
-                        }
-                    }
-                } else {
-                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
-                            }
+                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                            potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
                         }
                     }
                 }
@@ -1671,6 +1638,8 @@ void oei_deriv_disk(int max_deriv_order) {
 
             // Now write this shell set slab to HDF5 file
             // Create file space hyperslab, defining where to write data to in file
+            hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+            hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
             hsize_t count[3] = {n1, n2, nderivs_triu};
             hsize_t start[3] = {bf1, bf2, 0};
             fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
@@ -1686,17 +1655,19 @@ void oei_deriv_disk(int max_deriv_order) {
             if (p1 != p2) {
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
-                hsize_t count_T[3] = {n2, n1, nderivs_triu};
+                hsize_t stride_T[3] = {1, 1, 1}; // stride and block can be used to 
+                hsize_t block_T[3] = {n2, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+                hsize_t count_T[3] = {1, n1, nderivs_triu};
                 hsize_t start_T[3] = {bf2, bf1, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
+                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride_T, block_T);
                 // Create dataspace defining for memory dataset to write to file
                 hsize_t mem_dims_T[] = {n2, n1, nderivs_triu};
                 DataSpace mspace_T(3, mem_dims_T);
-                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride_T, block_T);
                 // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                overlap_dataset->write(overlap_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
-                kinetic_dataset->write(kinetic_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
-                potential_dataset->write(potential_shellset_slab_T, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
             }
 
             /* Release lock */

From 2b1856dd4376fb9499a697c88d821d646a888ae3 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 12 Feb 2024 13:16:22 -0500
Subject: [PATCH 48/91] TEI/OEI disk writing

---
 quax/integrals/libint_interface.cc | 510 ++++++++++++++++++-----------
 1 file changed, 323 insertions(+), 187 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index b512f81..c4c6149 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -729,7 +729,6 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
 
             if (type == "potential") {
                 for (int i = 0; i < natom; i++){
-                    // i = shell_atom_index_list[i];
                     if (i == desired_atom_idx) {
                         int tmp = 3 * (i + 2) + desired_coordinates[j];
                         indices[j].push_back(tmp);
@@ -1027,9 +1026,9 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
 //      HDF5 Dataset names within the file:
 //      oei_nbf1_nbf2_deriv1 
 //          shape (nbf,nbf,n_unique_1st_derivs)
-//      oei_nbf1_nbf2__deriv2 
+//      oei_nbf1_nbf2_deriv2 
 //          shape (nbf,nbf,n_unique_2nd_derivs)
-//      oei_nbf1_nbf2__deriv3 
+//      oei_nbf1_nbf2_deriv3 
 //          shape (nbf,nbf,n_unique_3rd_derivs)
 //      ...
 // The number of unique derivatives is essentially equal to the size of the
@@ -1045,6 +1044,10 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
     double check = (nbf1 * nbf2 * total_deriv_slices * 8) * (1e-9);
     assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
 
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
     // Create H5 File and prepare to fill with 0.0's
     const H5std_string file_name("oei_derivs.h5");
     H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
@@ -1053,10 +1056,6 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-        // how many shell and operator derivatives for potential integrals
-        int nshell_derivs = how_many_derivs(2, deriv_order);
-        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
         // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
@@ -1105,109 +1104,143 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
         /* Initialize lock */
         omp_init_lock(&lock);
 
-#pragma omp parallel for collapse(2) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                std::vector<long> shell_atom_index_list{atom1, atom2};
+#pragma omp parallel for num_threads(nthreads)
+        for (const auto &pair : shellpairs) {
+            int p1 = pair.first;
+            int p2 = pair.second;
 
-                int thread_id = 0;
+            const auto &s1 = bs1[p1];
+            const auto &s2 = bs2[p2];
+            auto n1 = bs1[p1].size(); // number of basis functions in first shell
+            auto n2 = bs2[p2].size(); // number of basis functions in second shell
+            auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+            auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+            auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+            auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            int thread_id = 0;
 #ifdef _OPENMP
-                thread_id = omp_get_thread_num();
+            thread_id = omp_get_thread_num();
 #endif
-                engines[thread_id].compute(bs1[s1], bs2[s2]); // Compute shell set
-                const auto& buffer = engines[thread_id].results(); // will point to computed shell sets
+            engines[thread_id].compute(s1, s2); // Compute shell set
+            const auto& buffer = engines[thread_id].results(); // will point to computed shell sets
 
-                // Define shell set slabs
-                double shellset_slab [n1][n2][nderivs_triu] = {};
+            // Define shell set slabs
+            double shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double shellset_slab_21 [n2][n1][nderivs_triu] = {};
 
-                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
-                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                    // Look up multidimensional cartesian derivative index
-                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-                    // Create a vector of vectors called `indices`, where each subvector
-                    // is your possible choices for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is order of differentiation
-                    // What follows fills these indices
-                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
 
-                    // Loop over each cartesian coordinate index which we are differentiating wrt
-                    // for this nuclear cartesian derivative index and check to see if it is present
-                    // in the shell duet, and where it is present in the potential operator
-                    for (int j = 0; j < multi_cart_idx.size(); j++){
-                        int desired_atom_idx = multi_cart_idx[j] / 3;
-                        int desired_coord = multi_cart_idx[j] % 3;
-                        // Loop over shell indices
-                        for (int i = 0; i < 2; i++){
-                            int atom_idx = shell_atom_index_list[i];
-                            if (atom_idx == desired_atom_idx) {
-                                int tmp = 3 * i + desired_coord;
-                                indices[j].push_back(tmp);
-                            }
+                // Loop over each cartesian coordinate index which we are differentiating wrt
+                // for this nuclear cartesian derivative index and check to see if it is present
+                // in the shell duet, and where it is present in the potential operator
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
                         }
-                        // Now for potentials only, loop over each atom in molecule, and if this derivative
-                        // differentiates wrt that atom, we also need to collect that index.
-                        if (type == "potential") {
-                            for (int i = 0; i < natom; i++){
-                                if (i == desired_atom_idx) {
-                                    int tmp = 3 * (i + 2) + desired_coord;
-                                    indices[j].push_back(tmp);
-                                }
+                    }
+                    // Now for potentials only, loop over each atom in molecule, and if this derivative
+                    // differentiates wrt that atom, we also need to collect that index.
+                    if (type == "potential") {
+                        for (int i = 0; i < natom; i++){
+                            if (i == desired_atom_idx) {
+                                int tmp = 3 * (i + 2) + desired_coord;
+                                indices[j].push_back(tmp);
                             }
                         }
                     }
+                }
 
-                    // Now indices is a vector of vectors, where each subvector is your choices
-                    // for the first derivative operator, second, third, etc
-                    // and the total number of subvectors is the order of differentiation
-                    // Now we want all combinations where we pick exactly one index from each subvector.
-                    // This is achievable through a cartesian product
-                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                    std::vector<int> buffer_indices;
-                    // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
-                    for (auto vec : index_combos)  {
-                        std::sort(vec.begin(), vec.end());
-                        int buf_idx = 0;
-                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                        buffer_indices.push_back(buf_idx);
+                // Now indices is a vector of vectors, where each subvector is your choices
+                // for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx);
+                }
+
+                // When both basis sets are identical and the two shells differ, fill the (s1,s2) slab and its transposed (s2,s1) slab together
+                if (bs1_equiv_bs2 && p1 != p2){
+                    // Loop over shell block for each buffer index which contributes to this derivative
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto shellset = buffer[buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                shellset_slab_12[f1][f2][nuc_idx] =
+                                    shellset_slab_21[f2][f1][nuc_idx] += shellset[idx];
+                            }
+                        }
                     }
+                } else {
                     // Loop over shell block for each buffer index which contributes to this derivative
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto shellset = buffer[buffer_indices[i]];
                         for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                             for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                                shellset_slab[f1][f2][nuc_idx] += shellset[idx];
+                                shellset_slab_12[f1][f2][nuc_idx] += shellset[idx];
                             }
                         }
                     }
-                } // Unique nuclear cartesian derivative indices loop
+                }
+            } // Unique nuclear cartesian derivative indices loop
 
-                /* Serialize HDF dataset writing using OpenMP lock */
-                omp_set_lock(&lock);
+            /* Serialize HDF dataset writing using OpenMP lock */
+            omp_set_lock(&lock);
 
+            // Now write this shell set slab to HDF5 file
+            // Create file space hyperslab, defining where to write data to in file
+            hsize_t count[3] = {n1, n2, nderivs_triu};
+            hsize_t start[3] = {bf1, bf2, 0};
+            fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+            // Create dataspace defining for memory dataset to write to file
+            hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+            DataSpace mspace(3, mem_dims);
+            mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+            // Write buffer data 'shellset_slab' with data type double from
+            // memory dataspace `mspace` to file dataspace `fspace`
+            dataset->write(shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+            if (bs1_equiv_bs2 && p1 != p2) {
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
-                hsize_t count[3] = {n1, n2, nderivs_triu};
-                hsize_t start[3] = {bf1, bf2, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                hsize_t count_T[3] = {n2, n1, nderivs_triu};
+                hsize_t start_T[3] = {bf2, bf1, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
                 // Create dataspace defining for memory dataset to write to file
-                hsize_t mem_dims[] = {n1, n2, nderivs_triu};
-                DataSpace mspace(3, mem_dims);
-                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                // Write buffer data 'shellset_slab' with data type double from
-                // memory dataspace `mspace` to file dataspace `fspace`
-                dataset->write(shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                /* Release lock */
-                omp_unset_lock(&lock);
+                hsize_t mem_dims_T[] = {n2, n1, nderivs_triu};
+                DataSpace mspace_T(3, mem_dims_T);
+                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                // Write buffer data 'shellset_slab_21' with data type double from memory dataspace `mspace_T` to file dataspace `fspace`
+                dataset->write(shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
             }
+
+            /* Release lock */
+            omp_unset_lock(&lock);
+            
         } // shell duet loops
         // Delete datasets for this derivative order
         delete dataset;
@@ -1242,6 +1275,12 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
     double check = (nbf1 * nbf2 * nbf3 * nbf4 * total_deriv_slices * 8) * (1e-9);
     assert(check < 50 && "Total disk space required for ERI's exceeds 50 GB. Increase threshold and recompile to proceed.");
 
+    // Shell screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    const auto bs3_equiv_bs4 = (bs3 == bs4);
+    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
+    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+    
     // Create H5 File and prepare to fill with 0.0's                                         
     const H5std_string file_name(type + "_derivs.h5");
     H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
@@ -1250,8 +1289,6 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // Number of unique shell derivatives output by libint (number of indices in buffer)
-        int nshell_derivs = how_many_derivs(4, deriv_order);
         // Number of unique nuclear derivatives of ERI's
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
@@ -1309,109 +1346,217 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
         /* Initialize lock */
         omp_init_lock(&lock);
 
-#pragma omp parallel for collapse(4) num_threads(nthreads)
-        for(auto s1 = 0; s1 != bs1.size(); ++s1) {
-            for(auto s2 = 0; s2 != bs2.size(); ++s2) {
-                for(auto s3 = 0; s3 != bs3.size(); ++s3) {
-                    for(auto s4 = 0; s4 != bs4.size(); ++s4) {
-                        auto bf1 = shell2bf_1[s1];     // Index of first basis function in shell 1
-                        auto atom1 = shell2atom_1[s1]; // Atom index of shell 1
-                        auto n1 = bs1[s1].size();    // number of basis functions in shell 1
-                        auto bf2 = shell2bf_2[s2];     // Index of first basis function in shell 2
-                        auto atom2 = shell2atom_2[s2]; // Atom index of shell 2
-                        auto n2 = bs2[s2].size();    // number of basis functions in shell 2
-                        auto bf3 = shell2bf_3[s3];     // Index of first basis function in shell 3
-                        auto atom3 = shell2atom_3[s3]; // Atom index of shell 3
-                        auto n3 = bs3[s3].size();    // number of basis functions in shell 3
-                        auto bf4 = shell2bf_4[s4];     // Index of first basis function in shell 4
-                        auto atom4 = shell2atom_4[s4]; // Atom index of shell 4
-                        auto n4 = bs4[s4].size();    // number of basis functions in shell 4
-
-                        if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
-                        std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
-
-                        int thread_id = 0;
+#pragma omp parallel for num_threads(nthreads)
+        for (const auto &pair : shellpairs_bra) {
+            int p1 = pair.first;
+            int p2 = pair.second;
+
+            const auto &s1 = bs1[p1];
+            const auto &s2 = bs2[p2];
+            auto n1 = bs1[p1].size(); // number of basis functions in first shell
+            auto n2 = bs2[p2].size(); // number of basis functions in second shell
+            auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+            auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+            auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+            auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+
+            for (const auto &pair : shellpairs_ket) {
+                int p3 = pair.first;
+                int p4 = pair.second;
+
+                const auto &s3 = bs3[p3];
+                const auto &s4 = bs4[p4];
+                auto n3 = bs3[p3].size(); // number of basis functions in third shell
+                auto n4 = bs4[p4].size(); // number of basis functions in fourth shell
+                auto bf3 = shell2bf_3[p3];  // first basis function in third shell
+                auto bf4 = shell2bf_4[p4];  // first basis function in fourth shell
+                auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
+                auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
+
+                // If the atoms are the same we ignore it as the derivatives will be zero.
+                if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
+                std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
+
+                int thread_id = 0;
 #ifdef _OPENMP
-                        thread_id = omp_get_thread_num();
+                thread_id = omp_get_thread_num();
 #endif
-                        engines[thread_id].compute(bs1[s1], bs2[s2], bs3[s3], bs4[s4]); // Compute shell set
-                        const auto& buffer = engines[thread_id].results(); // will point to computed shell sets
-
-                        // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
-                        double ints_shellset_slab [n1][n2][n3][n4][nderivs_triu] = {};
-                        // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
-                        for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
-                            // Look up multidimensional cartesian derivative index
-                            auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
-    
-                            std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
-    
-                            // Find out which 
-                            for (int j = 0; j < multi_cart_idx.size(); j++){
-                                int desired_atom_idx = multi_cart_idx[j] / 3;
-                                int desired_coord = multi_cart_idx[j] % 3;
-                                for (int i = 0; i < 4; i++){
-                                    int atom_idx = shell_atom_index_list[i];
-                                    if (atom_idx == desired_atom_idx) {
-                                        int tmp = 3 * i + desired_coord;
-                                        indices[j].push_back(tmp);
+                engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
+                const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+
+                // Define shell set slab, with extra dimension for unique derivatives, initialized with 0.0's
+                double ints_shellset_slab_1234 [n1][n2][n3][n4][nderivs_triu] = {};
+                double ints_shellset_slab_2143 [n2][n1][n4][n3][nderivs_triu] = {};
+                double ints_shellset_slab_2134 [n2][n1][n3][n4][nderivs_triu] = {};
+                double ints_shellset_slab_1243 [n1][n2][n4][n3][nderivs_triu] = {};
+
+                // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+                for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                    // Look up multidimensional cartesian derivative index
+                    auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+
+                    std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
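+                    // Find out which of the four shells sit on the atom being differentiated and record the matching shell-derivative index (3 * shell + coord)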
+                    for (int j = 0; j < multi_cart_idx.size(); j++){
+                        int desired_atom_idx = multi_cart_idx[j] / 3;
+                        int desired_coord = multi_cart_idx[j] % 3;
+                        for (int i = 0; i < 4; i++){
+                            int atom_idx = shell_atom_index_list[i];
+                            if (atom_idx == desired_atom_idx) {
+                                int tmp = 3 * i + desired_coord;
+                                indices[j].push_back(tmp);
+                            }
+                        }
+                    }
+
+                    // Now indices is a vector of vectors, where each subvector is your choices
+                    // for the first derivative operator, second, third, etc
+                    // and the total number of subvectors is the order of differentiation
+                    // Now we want all combinations where we pick exactly one index from each subvector.
+                    // This is achievable through a cartesian product 
+                    std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                    std::vector<int> buffer_indices;
+
+                    // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
+                    for (auto vec : index_combos)  {
+                        std::sort(vec.begin(), vec.end());
+                        int buf_idx = 0;
+                        // buffer_multidim_lookup
+                        auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                        if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                        buffer_indices.push_back(buf_idx);
+                    }
+
+                    auto full = false;
+                    // Loop over shell block, keeping a total count idx for the size of shell set
+                    if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto ints_shellset = buf_vec[buffer_indices[i]];
+                            if (ints_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            ints_shellset_slab_1234[f1][f2][f3][f4][nuc_idx] =
+                                                ints_shellset_slab_2143[f2][f1][f4][f3][nuc_idx] += ints_shellset[idx];
+                                        }
                                     }
                                 }
                             }
-
-                            // Now indices is a vector of vectors, where each subvector is your choices
-                            // for the first derivative operator, second, third, etc
-                            // and the total number of subvectors is the order of differentiation
-                            // Now we want all combinations where we pick exactly one index from each subvector.
-                            // This is achievable through a cartesian product 
-                            std::vector<std::vector<int>> index_combos = cartesian_product(indices);
-                            std::vector<int> buffer_indices;
-
-                            // Binary search to find 1d buffer index from multidimensional shell derivative index in `index_combos`
-                            for (auto vec : index_combos)  {
-                                std::sort(vec.begin(), vec.end());
-                                int buf_idx = 0;
-                                // buffer_multidim_lookup
-                                auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
-                                if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
-                                buffer_indices.push_back(buf_idx);
+                        }
+                        full = true;
+                    }
+                    if (bs1_equiv_bs2 && p1 != p2) {
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto ints_shellset = buf_vec[buffer_indices[i]];
+                            if (ints_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            ints_shellset_slab_1234[f1][f2][f3][f4][nuc_idx] =
+                                                ints_shellset_slab_2134[f2][f1][f3][f4][nuc_idx] += ints_shellset[idx];
+                                        }
+                                    }
+                                }
                             }
-
-                            // Loop over shell block, keeping a total count idx for the size of shell set
-                            for(auto i = 0; i < buffer_indices.size(); ++i) {
-                                auto ints_shellset = buffer[buffer_indices[i]];
-                                if (ints_shellset == nullptr) continue;
-                                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                                    for(auto f2 = 0; f2 != n2; ++f2) {
-                                        for(auto f3 = 0; f3 != n3; ++f3) {
-                                            for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
-                                                ints_shellset_slab[f1][f2][f3][f4][nuc_idx] += ints_shellset[idx];
-                                            }
+                        }
+                        full = true;
+                    }
+                    if (bs3_equiv_bs4 && p3 != p4) {
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto ints_shellset = buf_vec[buffer_indices[i]];
+                            if (ints_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            ints_shellset_slab_1234[f1][f2][f3][f4][nuc_idx] =
+                                                ints_shellset_slab_1243[f1][f2][f4][f3][nuc_idx] += ints_shellset[idx];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        full = true;
+                    }
+                    if (full == false) {
+                        for(auto i = 0; i < buffer_indices.size(); ++i) {
+                            auto ints_shellset = buf_vec[buffer_indices[i]];
+                            if (ints_shellset == nullptr) continue;
+                            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                                for(auto f2 = 0; f2 != n2; ++f2) {
+                                    for(auto f3 = 0; f3 != n3; ++f3) {
+                                        for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
+                                            ints_shellset_slab_1234[f1][f2][f3][f4][nuc_idx] += ints_shellset[idx];
                                         }
                                     }
                                 }
                             }
-                        } // For every nuc_idx 0, nderivs_triu
-
-                        /* Serialize HDF dataset writing using OpenMP lock */
-                        omp_set_lock(&lock);
-
-                        // Now write this shell set slab to HDF5 file
-                        hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
-                        hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
-                        fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
-                        // Create dataspace defining for memory dataset to write to file
-                        hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
-                        DataSpace mspace(5, mem_dims);
-                        mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
-                        // Write buffer data 'shellset_slab' with data type double from
-                        // memory dataspace `mspace` to file dataspace `fspace`
-                        dataset->write(ints_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-
-                        /* Release lock */
-                        omp_unset_lock(&lock);
+                        }
                     }
+                } // For every nuc_idx 0, nderivs_triu
+
+                /* Serialize HDF dataset writing using OpenMP lock */
+                omp_set_lock(&lock);
+
+                // Now write this shell set slab to HDF5 file
+                hsize_t count[5] = {n1, n2, n3, n4, nderivs_triu};
+                hsize_t start[5] = {bf1, bf2, bf3, bf4, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+                // Create dataspace defining for memory dataset to write to file
+                hsize_t mem_dims[] = {n1, n2, n3, n4, nderivs_triu};
+                DataSpace mspace(5, mem_dims);
+                mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+                // Write buffer data 'shellset_slab' with data type double from
+                // memory dataspace `mspace` to file dataspace `fspace`
+                dataset->write(ints_shellset_slab_1234, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+                if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
+                    // Now write this shell set slab to HDF5 file
+                    hsize_t count_T[5] = {n2, n1, n4, n3, nderivs_triu};
+                    hsize_t start_T[5] = {bf2, bf1, bf4, bf3, 0};
+                    fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
+                    // Create dataspace defining for memory dataset to write to file
+                    hsize_t mem_dims_T[] = {n2, n1, n4, n3, nderivs_triu};
+                    DataSpace mspace_T(5, mem_dims_T);
+                    mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                    // Write buffer data 'shellset_slab' with data type double from
+                    // memory dataspace `mspace` to file dataspace `fspace`
+                    dataset->write(ints_shellset_slab_2143, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                }
+
+                if (bs1_equiv_bs2 && p1 != p2) {
+                    // Now write this shell set slab to HDF5 file
+                    hsize_t count_T[5] = {n2, n1, n3, n4, nderivs_triu};
+                    hsize_t start_T[5] = {bf2, bf1, bf3, bf4, 0};
+                    fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
+                    // Create dataspace defining for memory dataset to write to file
+                    hsize_t mem_dims_T[] = {n2, n1, n3, n4, nderivs_triu};
+                    DataSpace mspace_T(5, mem_dims_T);
+                    mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                    // Write buffer data 'shellset_slab' with data type double from
+                    // memory dataspace `mspace` to file dataspace `fspace`
+                    dataset->write(ints_shellset_slab_2134, PredType::NATIVE_DOUBLE, mspace_T, fspace);
                 }
+
+                if (bs3_equiv_bs4 && p3 != p4) {
+                    // Now write this shell set slab to HDF5 file
+                    hsize_t count_T[5] = {n1, n2, n4, n3, nderivs_triu};
+                    hsize_t start_T[5] = {bf1, bf2, bf4, bf3, 0};
+                    fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
+                    // Create dataspace defining for memory dataset to write to file
+                    hsize_t mem_dims_T[] = {n1, n2, n4, n3, nderivs_triu};
+                    DataSpace mspace_T(5, mem_dims_T);
+                    mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                    // Write buffer data 'shellset_slab' with data type double from
+                    // memory dataspace `mspace` to file dataspace `fspace`
+                    dataset->write(ints_shellset_slab_1243, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                }
+
+                /* Release lock */
+                omp_unset_lock(&lock);
             }
         } // shell quartet loops
         // Close the dataset for this derivative order
@@ -1467,10 +1612,6 @@ void oei_deriv_disk(int max_deriv_order) {
     plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
 
     for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
-        // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-        // how many shell and operator derivatives for potential integrals
-        int nshell_derivs = how_many_derivs(2, deriv_order);
-        int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
         // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
         unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
@@ -1692,10 +1833,6 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
     // Shell pairs after screening
     auto shellpairs = build_shellpairs(bs1, bs2);
 
-    // how many shell derivatives in the Libint buffer for overlap/kinetic integrals
-    // how many shell and operator derivatives for potential integrals
-    int nshell_derivs = how_many_derivs(2, deriv_order);
-    int nshell_derivs_potential = how_many_derivs(2, deriv_order, natom);
     // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
@@ -1872,8 +2009,6 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
 
 // Computes a single 'deriv_order' derivative tensor of electron repulsion integrals, keeps everything in core memory
 py::array eri_deriv_core(int deriv_order) {
-    // Number of unique shell derivatives output by libint (number of indices in buffer)
-    int nshell_derivs = how_many_derivs(4, deriv_order);
     // Number of unique nuclear derivatives of ERI's
     unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
 
@@ -1926,6 +2061,7 @@ py::array eri_deriv_core(int deriv_order) {
             auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
             auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
 
+            // If the atoms are the same we ignore it as the derivatives will be zero.
             if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
             std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
 

From 7b82e8307e42abdd19e27d17294e35be90e31681 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 13 Feb 2024 13:38:10 -0500
Subject: [PATCH 49/91] Proper Schwarz Screening, clean-up of errant disk algos

---
 quax/integrals/libint_interface.cc | 179 +++++++++++++++++++----------
 1 file changed, 117 insertions(+), 62 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index c4c6149..8c64568 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -342,7 +342,7 @@ std::vector<std::pair<int, int>> build_shellpairs(libint2::BasisSet A, libint2::
 }
 
 // Schwarz-Screening of two-electron integrals
-std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2::BasisSet B){
+std::tuple<std::vector<std::pair<int, int>>, std::vector<double>> schwarz_screening(libint2::BasisSet A, libint2::BasisSet B){
 
     const auto A_equiv_B = (A == B);
 
@@ -387,8 +387,14 @@ std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2:
                             std::max(shell_max_val, std::fabs(buffer[f1 * (n1 * n2 * n2 + n2) + f2 * (n1 * n2 + 1)]));
                     }
                 }
+
                 max_integral = std::max(max_integral, shell_max_val);
-                shell_pair_values[s1 * B.size() + s2] = shell_max_val;
+
+                if (A_equiv_B) {
+                    shell_pair_values[s1 * B.size() + s2] = shell_pair_values[s2 * A.size() + s1] = shell_max_val;
+                } else {
+                    shell_pair_values[s1 * B.size() + s2] = shell_max_val;
+                }
             }
         }
     }
@@ -406,7 +412,7 @@ std::vector<std::pair<int, int>> schwarz_screening(libint2::BasisSet A, libint2:
         }
     }
 
-    return shell_pairs;
+    return std::make_tuple(shell_pairs, shell_pair_values);
 }
 
 // Compute one-electron integral
@@ -482,10 +488,13 @@ py::array compute_1e_int(std::string type) {
 // Computes two-electron integrals
 py::array compute_2e_int(std::string type, double beta) {
     // Shell screening
+    std::vector<std::pair<int, int>> shellpairs_bra, shellpairs_ket;
+    std::vector<double> schwarz_bra, schwarz_ket;
     const auto bs1_equiv_bs2 = (bs1 == bs2);
     const auto bs3_equiv_bs4 = (bs3 == bs4);
-    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
-    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+    std::tie(shellpairs_bra, schwarz_bra) = schwarz_screening(bs1, bs2);
+    std::tie(shellpairs_ket, schwarz_ket) = schwarz_screening(bs3, bs4);
+    auto threshold_sq = threshold * threshold;
 
     // workaround for data copying: perhaps pass an empty numpy array, then populate it in C++?
     // avoids last line, which copies
@@ -545,6 +554,9 @@ py::array compute_2e_int(std::string type, double beta) {
             auto bf3 = shell2bf_3[p3];  // first basis function in first shell
             auto bf4 = shell2bf_4[p4];  // first basis function in second shell
 
+            // Perform schwarz screening
+            if (schwarz_bra[p1 * bs2.size() + p2] * schwarz_ket[p3 * bs4.size() + p4] < threshold_sq) continue;
+
             int thread_id = 0;
 #ifdef _OPENMP
             thread_id = omp_get_thread_num();
@@ -797,10 +809,13 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
     const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(12, deriv_order);
 
     // Shell screening
+    std::vector<std::pair<int, int>> shellpairs_bra, shellpairs_ket;
+    std::vector<double> schwarz_bra, schwarz_ket;
     const auto bs1_equiv_bs2 = (bs1 == bs2);
     const auto bs3_equiv_bs4 = (bs3 == bs4);
-    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
-    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+    std::tie(shellpairs_bra, schwarz_bra) = schwarz_screening(bs1, bs2);
+    std::tie(shellpairs_ket, schwarz_ket) = schwarz_screening(bs3, bs4);
+    auto threshold_sq = threshold * threshold;
 
     // ERI derivative integral engine
     std::vector<libint2::Engine> engines(nthreads);
@@ -844,7 +859,7 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
         const auto &s1 = bs1[p1];
         const auto &s2 = bs2[p2];
         auto n1 = bs1[p1].size(); // number of basis functions in first shell
-        auto n2 = bs2[p2].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
         auto bf1 = shell2bf_1[p1];  // first basis function in first shell
         auto bf2 = shell2bf_2[p2];  // first basis function in second shell
         auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
@@ -856,13 +871,16 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
 
             const auto &s3 = bs3[p3];
             const auto &s4 = bs4[p4];
-            auto n3 = bs3[p3].size(); // number of basis functions in first shell
-            auto n4 = bs4[p4].size(); // number of basis functions in first shell
-            auto bf3 = shell2bf_3[p3];  // first basis function in first shell
-            auto bf4 = shell2bf_4[p4];  // first basis function in second shell
+            auto n3 = bs3[p3].size(); // number of basis functions in third shell
+            auto n4 = bs4[p4].size(); // number of basis functions in fourth shell
+            auto bf3 = shell2bf_3[p3];  // first basis function in third shell
+            auto bf4 = shell2bf_4[p4];  // first basis function in fourth shell
             auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
             auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
 
+            // Perform schwarz screening
+            if (schwarz_bra[p1 * bs2.size() + p2] * schwarz_ket[p3 * bs4.size() + p4] < threshold_sq) continue;
+
             // If the atoms are the same we ignore it as the derivatives will be zero.
             if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
             // Ensure all desired_atoms correspond to at least one shell atom to
@@ -1276,10 +1294,13 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
     assert(check < 50 && "Total disk space required for ERI's exceeds 50 GB. Increase threshold and recompile to proceed.");
 
     // Shell screening
+    std::vector<std::pair<int, int>> shellpairs_bra, shellpairs_ket;
+    std::vector<double> schwarz_bra, schwarz_ket;
     const auto bs1_equiv_bs2 = (bs1 == bs2);
     const auto bs3_equiv_bs4 = (bs3 == bs4);
-    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
-    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+    std::tie(shellpairs_bra, schwarz_bra) = schwarz_screening(bs1, bs2);
+    std::tie(shellpairs_ket, schwarz_ket) = schwarz_screening(bs3, bs4);
+    auto threshold_sq = threshold * threshold;
     
     // Create H5 File and prepare to fill with 0.0's                                         
     const H5std_string file_name(type + "_derivs.h5");
@@ -1373,6 +1394,9 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                 auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
                 auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
 
+                // Perform schwarz screening
+                if (schwarz_bra[p1 * bs2.size() + p2] * schwarz_ket[p3 * bs4.size() + p4] < threshold_sq) continue;
+
                 // If the atoms are the same we ignore it as the derivatives will be zero.
                 if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
                 std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};
@@ -1397,7 +1421,6 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
 
                     std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
 
-                    // Find out which 
                     for (int j = 0; j < multi_cart_idx.size(); j++){
                         int desired_atom_idx = multi_cart_idx[j] / 3;
                         int desired_coord = multi_cart_idx[j] % 3;
@@ -1655,6 +1678,8 @@ void oei_deriv_disk(int max_deriv_order) {
         DataSet* overlap_dataset = new DataSet(file->createDataSet(overlap_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* kinetic_dataset = new DataSet(file->createDataSet(kinetic_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
         DataSet* potential_dataset = new DataSet(file->createDataSet(potential_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
         hsize_t zerostart[3] = {0, 0, 0};
 
         /* Initialize lock */
@@ -1687,9 +1712,12 @@ void oei_deriv_disk(int max_deriv_order) {
             const auto& potential_buffer = v_engines[thread_id].results(); // will point to computed shell sets
 
             // Define shell set slabs
-            double overlap_shellset_slab [n1][n2][nderivs_triu] = {};
-            double kinetic_shellset_slab [n1][n2][nderivs_triu] = {};
-            double potential_shellset_slab [n1][n2][nderivs_triu] = {};
+            double overlap_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double kinetic_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double potential_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double overlap_shellset_slab_21 [n2][n1][nderivs_triu] = {};
+            double kinetic_shellset_slab_21 [n2][n1][nderivs_triu] = {};
+            double potential_shellset_slab_21 [n2][n1][nderivs_triu] = {};
 
             // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
             // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
@@ -1752,23 +1780,49 @@ void oei_deriv_disk(int max_deriv_order) {
                 }
 
                 // Loop over shell block for each buffer index which contributes to this derivative
-                // Overlap and Kinetic
-                for(auto i = 0; i < buffer_indices.size(); ++i) {
-                    auto overlap_shellset = overlap_buffer[buffer_indices[i]];
-                    auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            overlap_shellset_slab[f1][f2][nuc_idx] += overlap_shellset[idx];
-                            kinetic_shellset_slab[f1][f2][nuc_idx] += kinetic_shellset[idx];
+                if (p1 != p2) {
+                    // Overlap and Kinetic
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                overlap_shellset_slab_12[f1][f2][nuc_idx] =
+                                    overlap_shellset_slab_21[f2][f1][nuc_idx] += overlap_shellset[idx];
+                                kinetic_shellset_slab_12[f1][f2][nuc_idx] =
+                                    kinetic_shellset_slab_21[f2][f1][nuc_idx] += kinetic_shellset[idx];
+                            }
                         }
                     }
-                }
-                // Potential
-                for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
-                    auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
-                    for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
-                        for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            potential_shellset_slab[f1][f2][nuc_idx] += potential_shellset[idx];
+                    // Potential
+                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                potential_shellset_slab_12[f1][f2][nuc_idx] =
+                                    potential_shellset_slab_21[f2][f1][nuc_idx] += potential_shellset[idx];
+                            }
+                        }
+                    }
+                } else { 
+                    // Overlap and Kinetic
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto overlap_shellset = overlap_buffer[buffer_indices[i]];
+                        auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                overlap_shellset_slab_12[f1][f2][nuc_idx] += overlap_shellset[idx];
+                                kinetic_shellset_slab_12[f1][f2][nuc_idx] += kinetic_shellset[idx];
+                            }
+                        }
+                    }
+                    // Potential
+                    for(auto i = 0; i < potential_buffer_indices.size(); ++i) {
+                        auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                potential_shellset_slab_12[f1][f2][nuc_idx] += potential_shellset[idx];
+                            }
                         }
                     }
                 }
@@ -1779,8 +1833,6 @@ void oei_deriv_disk(int max_deriv_order) {
 
             // Now write this shell set slab to HDF5 file
             // Create file space hyperslab, defining where to write data to in file
-            hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
-            hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
             hsize_t count[3] = {n1, n2, nderivs_triu};
             hsize_t start[3] = {bf1, bf2, 0};
             fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
@@ -1789,26 +1841,24 @@ void oei_deriv_disk(int max_deriv_order) {
             DataSpace mspace(3, mem_dims);
             mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
             // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-            overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-            kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
-            potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace, fspace);
+            overlap_dataset->write(overlap_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+            kinetic_dataset->write(kinetic_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+            potential_dataset->write(potential_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
 
             if (p1 != p2) {
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
-                hsize_t stride_T[3] = {1, 1, 1}; // stride and block can be used to 
-                hsize_t block_T[3] = {n2, 1, 1};  // add values to multiple places, useful if symmetry ever used.
-                hsize_t count_T[3] = {1, n1, nderivs_triu};
+                hsize_t count_T[3] = {n2, n1, nderivs_triu};
                 hsize_t start_T[3] = {bf2, bf1, 0};
-                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride_T, block_T);
+                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
                 // Create dataspace defining for memory dataset to write to file
                 hsize_t mem_dims_T[] = {n2, n1, nderivs_triu};
                 DataSpace mspace_T(3, mem_dims_T);
-                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride_T, block_T);
+                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
                 // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
-                overlap_dataset->write(overlap_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
-                kinetic_dataset->write(kinetic_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
-                potential_dataset->write(potential_shellset_slab, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                overlap_dataset->write(overlap_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                kinetic_dataset->write(kinetic_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                potential_dataset->write(potential_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
             }
 
             /* Release lock */
@@ -1962,10 +2012,10 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                     auto kinetic_shellset = kinetic_buffer[buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += overlap_shellset[idx];
-                            S[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += overlap_shellset[idx];
-                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += kinetic_shellset[idx];
-                            T[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += kinetic_shellset[idx];
+                            S[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] =
+                                S[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += overlap_shellset[idx];
+                            T[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] =
+                                T[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += kinetic_shellset[idx];
                         }
                     }
                 }
@@ -1987,8 +2037,8 @@ std::vector<py::array> oei_deriv_core(int deriv_order) {
                     auto potential_shellset = potential_buffer[potential_buffer_indices[i]];
                     for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                         for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] += potential_shellset[idx];
-                            V[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += potential_shellset[idx];
+                            V[(bf1 + f1) * nbf2 + bf2 + f2 + offset_nuc_idx] =
+                                V[(bf2 + f2) * nbf1 + bf1 + f1 + offset_nuc_idx] += potential_shellset[idx];
                         }
                     }
                 }
@@ -2018,9 +2068,11 @@ py::array eri_deriv_core(int deriv_order) {
     // Create mapping from 1d cartesian coodinate index (flattened upper triangle cartesian derivative index) to multidimensional index
     const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
 
-    // Shell screening
-    const auto shellpairs_bra = schwarz_screening(bs1, bs2);
-    const auto shellpairs_ket = schwarz_screening(bs3, bs4);
+    // Shell screening assumes bs1 == bs2 == bs3 == bs4 for Hartree-Fock
+    std::vector<std::pair<int, int>> shellpairs;
+    std::vector<double> schwarz;
+    std::tie(shellpairs, schwarz) = schwarz_screening(bs1, bs2);
+    auto threshold_sq = threshold * threshold;
 
     // Libint engine for computing shell quartet derivatives
     std::vector<libint2::Engine> engines(nthreads);
@@ -2035,32 +2087,35 @@ py::array eri_deriv_core(int deriv_order) {
     std::vector<double> result(length);
 
 #pragma omp parallel for num_threads(nthreads)
-    for (const auto &pair : shellpairs_bra) {
+    for (const auto &pair : shellpairs) {
         int p1 = pair.first;
         int p2 = pair.second;
 
         const auto &s1 = bs1[p1];
         const auto &s2 = bs2[p2];
         auto n1 = bs1[p1].size(); // number of basis functions in first shell
-        auto n2 = bs2[p2].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
         auto bf1 = shell2bf_1[p1];  // first basis function in first shell
         auto bf2 = shell2bf_2[p2];  // first basis function in second shell
         auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
         auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
 
-        for (const auto &pair : shellpairs_ket) {
+        for (const auto &pair : shellpairs) {
             int p3 = pair.first;
             int p4 = pair.second;
 
             const auto &s3 = bs3[p3];
             const auto &s4 = bs4[p4];
-            auto n3 = bs3[p3].size(); // number of basis functions in first shell
-            auto n4 = bs4[p4].size(); // number of basis functions in first shell
-            auto bf3 = shell2bf_3[p3];  // first basis function in first shell
-            auto bf4 = shell2bf_4[p4];  // first basis function in second shell
+            auto n3 = bs3[p3].size(); // number of basis functions in third shell
+            auto n4 = bs4[p4].size(); // number of basis functions in fourth shell
+            auto bf3 = shell2bf_3[p3];  // first basis function in third shell
+            auto bf4 = shell2bf_4[p4];  // first basis function in fourth shell
             auto atom3 = shell2atom_3[p3]; // Atom index of shell 3
             auto atom4 = shell2atom_4[p4]; // Atom index of shell 4
 
+            // Perform schwarz screening
+            if (schwarz[p1 * bs2.size() + p2] * schwarz[p3 * bs4.size() + p4] < threshold_sq) continue;
+
             // If the atoms are the same we ignore it as the derivatives will be zero.
             if (atom1 == atom2 && atom1 == atom3 && atom1 == atom4) continue;
             std::vector<long> shell_atom_index_list{atom1, atom2, atom3, atom4};

From 579ed1f67991fd0464e5607192b849eb30e76750 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 16 Feb 2024 13:39:14 -0500
Subject: [PATCH 50/91] FC Approx Working

---
 quax/core.py                 | 54 ++++++++++++++++++++----------------
 quax/methods/ccsd.py         | 17 ++++++------
 quax/methods/ccsd_t.py       |  4 +--
 quax/methods/hartree_fock.py |  6 ++--
 quax/methods/mp2.py          | 12 ++++----
 quax/methods/mp2f12.py       | 51 +++++++++++++++++++---------------
 quax/utils.py                | 35 +++++++++++++++++++++++
 7 files changed, 111 insertions(+), 68 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index ccfe62e..de83ca6 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -14,7 +14,7 @@
 from .methods.mp2f12 import restricted_mp2_f12
 from .methods.ccsd import rccsd
 from .methods.ccsd_t import rccsd_t
-from .utils import get_required_deriv_vecs
+from .utils import get_required_deriv_vecs, n_frozen_core
 
 psi4.core.be_quiet()
 
@@ -39,6 +39,7 @@ def check_options(options):
                        'spectral_shift': True,
                        'integral_algo': 'libint_core',
                        'ints_tolerance': 1.0e-14,
+                       'freeze_core': False,
                        'beta': 1.0
                       }
 
@@ -78,35 +79,39 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
     mult = molecule.multiplicity()
     charge = molecule.molecular_charge()
     nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
 
     basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     nbf = basis_set.nbf()
-    natoms = molecule.natom()
     print("Number of basis functions: ", nbf)
 
     if 'f12' in method:
         cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
 
     # Energy and full derivative tensor evaluations
-    args = (geom, basis_set, xyz_path, nuclear_charges, charge, options)
     if not partial:
         # Create energy evaluation function
         if method == 'scf' or method == 'hf' or method == 'rhf':
-            def electronic_energy(*args, deriv_order=deriv_order):
-                return restricted_hartree_fock(*args, deriv_order=deriv_order)
+            args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+            def electronic_energy(*args, options=options, deriv_order=deriv_order):
+                return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2':
-            def electronic_energy(*args, deriv_order=deriv_order):
-                return restricted_mp2(*args, deriv_order=deriv_order)
+            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            def electronic_energy(*args, options=options, deriv_order=deriv_order):
+                return restricted_mp2(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2-f12':
-            args += (cabs_set,)
-            def electronic_energy(*args, deriv_order=deriv_order):
-                return restricted_mp2_f12(*args, deriv_order=deriv_order)
+            args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            def electronic_energy(*args, options=options, deriv_order=deriv_order):
+                return restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd':
-            def electronic_energy(*args, deriv_order=deriv_order):
-                return rccsd(*args, deriv_order=deriv_order)
+            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            def electronic_energy(*args, options=options, deriv_order=deriv_order):
+                return rccsd(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd(t)':
-            def electronic_energy(*args, deriv_order=deriv_order):
-                return rccsd_t(*args, deriv_order=deriv_order)
+            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            def electronic_energy(*args, options=options, deriv_order=deriv_order):
+                return rccsd_t(*args, options=options, deriv_order=deriv_order)
         else:
             print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
 
@@ -137,6 +142,7 @@ def electronic_energy(*args, deriv_order=deriv_order):
             raise Exception("The length of the index coordinates given by 'partial' argument should be the same as the order of differentiation")
 
         # Estimate memory footprint of two electron integrals partial derivatives
+        natoms = molecule.natom()
         nderivs = get_required_deriv_vecs(natoms, deriv_order, partial).shape[0]
         ngigabytes = nbf**4 * 64 * 8 * nderivs / 1e9
         print("Estimated memory footprint from two-electron integral partial derivatives: {} GB".format(ngigabytes))
@@ -149,28 +155,28 @@ def electronic_energy(*args, deriv_order=deriv_order):
         # JAX will then collect the internal coordinate partial derivative instead. 
         if method == 'scf' or method == 'hf' or method == 'rhf':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_scf = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=False)
+                E_scf = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path,\
+                                                options=options, deriv_order=deriv_order, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_mp2 = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_mp2 = restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
+                                       options=options, deriv_order=deriv_order)
                 return E_mp2
         elif method =='mp2-f12':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_mp2f12 = restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=deriv_order)
+                E_mp2f12 = restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
+                                              options=options, deriv_order=deriv_order)
                 return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_ccsd = rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_ccsd = rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
+                               options=options, deriv_order=deriv_order)
                 return E_ccsd
         elif method =='ccsd(t)':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_ccsd_t = rccsd_t(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order)
+                E_ccsd_t = rccsd_t(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
+                                   options=options, deriv_order=deriv_order)
                 return E_ccsd_t
         else:
             raise Exception("Error: Method {} not supported.".format(method))
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index df578d7..fe0f32c 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -6,17 +6,16 @@
 from .energy_utils import tei_transformation
 from .hartree_fock import restricted_hartree_fock
 
-def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
-    # Do HF
-    E_scf, C, eps, V = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+def rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
+    ndocc = nelectrons // 2
+    ncore = nfrzn // 2
+    E_scf, C, eps, V = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running CCSD Computation...")
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
-    ndocc = nelectrons // 2
     nbf = V.shape[0]
     nvir = nbf - ndocc
 
-    o = slice(0, ndocc)
+    o = slice(ncore, ndocc)
     v = slice(ndocc, nbf)
 
     # Save slices of two-electron repulsion integrals in MO basis
@@ -32,7 +31,7 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
     d = 1.0 / (fock_Od.reshape(-1, 1) - fock_Vd)
 
     # Initial Amplitudes
-    T1 = jnp.zeros((ndocc, nvir))
+    T1 = jnp.zeros((ndocc - ncore, nvir))
     T2 = D * V[2]
 
     maxit = options['maxit']
@@ -42,7 +41,7 @@ def rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_ord
     while abs(E_ccsd - E_old)  > 1e-9:
         E_old = E_ccsd * 1
 
-        T1, T2 = rccsd_iter(T1, T2, V, d, D, ndocc, nvir)
+        T1, T2 = rccsd_iter(T1, T2, V, d, D)
         E_ccsd = rccsd_energy(T1, T2, V[2])
 
         iteration += 1
@@ -69,7 +68,7 @@ def rccsd_energy(T1, T2, Voovv):
 
 # Jit compiling ccsd is a BAD IDEA.
 # TODO consider breaking up function and jit compiling those which do not use more memory than TEI transformation
-def rccsd_iter(T1, T2, V, d, D, ndocc, nvir):
+def rccsd_iter(T1, T2, V, d, D):
     Voooo, Vooov, Voovv, Vovov, Vovvv, Vvvvv = V
 
     newT1 = jnp.zeros(T1.shape)
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 8aaf6eb..3bb89a8 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -93,8 +93,8 @@ def loop_k(arr2):
     i, j, k, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
     return pT
 
-def rccsd_t(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0):
-    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+def rccsd_t(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0):
+    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running (T) Correction...")
     pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 026028a..9ff1696 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -6,7 +6,7 @@
 from .ints import compute_integrals
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
-def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
+def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
     print("Running Hartree-Fock Computation...")
     # Load keyword options
     maxit = options['maxit']
@@ -14,8 +14,6 @@ def restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge,
     damp_factor = options['damp_factor']
     spectral_shift = options['spectral_shift']
     convergence = 1e-10
-
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
     ndocc = nelectrons // 2
 
     # If we are doing MP2 or CCSD after, might as well use jit-compiled JK-build, since HF will not be memory bottleneck
@@ -93,6 +91,6 @@ def rhf_iter(F,D):
     if not return_aux_data:
         return E_scf
     else:
-        #print("RHF Energy:                ", E_scf)
+        # print("RHF Energy:                ", E_scf)
         return E_scf, C, eps, G
 
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 5bcb663..88f8548 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -7,18 +7,18 @@
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
 
-def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=0, return_aux_data=False):
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+def restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
     ndocc = nelectrons // 2
-    E_scf, C, eps, G = restricted_hartree_fock(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order=deriv_order, return_aux_data=True)
+    ncore = nfrzn // 2
+    E_scf, C, eps, G = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running MP2 Computation...")
     nvirt = G.shape[0] - ndocc
 
-    G = partial_tei_transformation(G, C[:,:ndocc], C[:,ndocc:], C[:,:ndocc], C[:,ndocc:])
+    G = partial_tei_transformation(G, C[:,ncore:ndocc], C[:,ndocc:], C[:,ncore:ndocc], C[:,ndocc:])
 
     # Create tensor dim (occ,vir,occ,vir) of all possible orbital energy denominators
-    eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
+    eps_occ, eps_vir = eps[ncore:ndocc], eps[ndocc:]
     e_denom = jnp.reciprocal(eps_occ.reshape(-1, 1, 1, 1) - eps_vir.reshape(-1, 1, 1) + eps_occ.reshape(-1, 1) - eps_vir)
 
     # Tensor contraction algo 
@@ -29,7 +29,7 @@ def restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options,
 
     # Loop algo (lower memory, but tei transform is the memory bottleneck)
     # Create all combinations of four loop variables to make XLA compilation easier
-    indices = cartesian_product(jnp.arange(ndocc), jnp.arange(ndocc), jnp.arange(nvirt), jnp.arange(nvirt))
+    indices = cartesian_product(jnp.arange(ndocc-ncore), jnp.arange(ndocc-ncore), jnp.arange(nvirt), jnp.arange(nvirt))
 
     def loop_mp2(idx, mp2_corr):
         i,j,a,b = indices[idx]
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 09aad86..9bb85c0 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -11,10 +11,10 @@
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .mp2 import restricted_mp2
 
-def restricted_mp2_f12(geom, basis_set, xyz_path, nuclear_charges, charge, options, cabs_set, deriv_order=0):
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+def restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0):
+    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order, return_aux_data=True)
     ndocc = nelectrons // 2
-    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, xyz_path, nuclear_charges, charge, options, deriv_order, return_aux_data=True)
+    ncore = nfrzn // 2
     eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
@@ -82,12 +82,36 @@ def loop_energy(idx, f12_corr):
 
         return f12_corr
 
-    dE_mp2f12 = fori_loop(0, indices.shape[0], loop_energy, 0.0)
+    start = ndocc if ncore > 0 else 0
+    dE_mp2f12 = fori_loop(start, indices.shape[0], loop_energy, 0.0)
 
     E_s = cabs_singles(f, ndocc, nri)
 
+    print(E_mp2)
+    print(dE_mp2f12)
+    print(E_s)
+
     return E_mp2 + dE_mp2f12 + E_s
 
+# CABS Singles
+def cabs_singles(f, ndocc, nri):
+    all_vir = nri - ndocc
+
+    e_ij, C_ij = jnp.linalg.eigh(f[:ndocc, :ndocc])
+    e_AB, C_AB = jnp.linalg.eigh(f[ndocc:, ndocc:])
+
+    f_iA = C_ij.T @ f[:ndocc, ndocc:] @ C_AB
+
+    indices = cartesian_product(jnp.arange(ndocc), jnp.arange(all_vir))
+
+    def loop_singles(idx, singles):
+        i, A = indices[idx]
+        singles += 2 * f_iA[i, A]**2 / (e_ij[i] - e_AB[A])
+        return singles
+    E_s = fori_loop(0, indices.shape[0], loop_singles, 0.0)
+
+    return E_s
+
 # Fixed Amplitude Ansatz
 @jax.jit
 def t_(p, q, r, s):
@@ -239,25 +263,6 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
 
     return f, fk, k
 
-# CABS Singles
-def cabs_singles(f, ndocc, nri):
-    all_vir = nri - ndocc
-
-    e_ij, C_ij = jnp.linalg.eigh(f[:ndocc, :ndocc])
-    e_AB, C_AB = jnp.linalg.eigh(f[ndocc:, ndocc:])
-
-    f_iA = C_ij.T @ f[:ndocc, ndocc:] @ C_AB
-
-    indices = cartesian_product(jnp.arange(ndocc), jnp.arange(all_vir))
-    
-    def loop_singles(idx, singles):
-        i, A = indices[idx]
-        singles += 2 * f_iA[i, A]**2 / (e_ij[i] - e_AB[A])
-        return singles
-    E_s = fori_loop(0, indices.shape[0], loop_singles, 0.0)
-
-    return E_s
-
 # F12 Intermediates
 def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
     C_occ = C_obs.at[:, :ndocc].get()
diff --git a/quax/utils.py b/quax/utils.py
index 7dcbf7a..50d771d 100644
--- a/quax/utils.py
+++ b/quax/utils.py
@@ -1,6 +1,41 @@
 import numpy as np
 import itertools
 
+def atom_to_period(Z):
+    # Period 1, 2, 3, 4, 5, 6, 7
+    full_shell_values = [0, 2, 10, 18, 36, 54, 86, 118]
+
+    for p, shell in enumerate(full_shell_values):
+        if shell > Z:
+            return p
+   
+def period_to_full_shell(p):
+    # Period 1, 2, 3, 4, 5, 6, 7
+    full_shell_values = [0, 2, 10, 18, 36, 54, 86, 118]
+
+    return full_shell_values[p]
+
+def n_frozen_core(mol, Z_mol):
+    nfrzn = 0
+    mol_valence = -1 * Z_mol
+    largest_shell = 0
+
+    for A in range(mol.natom()):
+        Z = mol.charge(A)
+        current_shell = atom_to_period(Z)
+        delta = period_to_full_shell(current_shell - 1)
+
+        if largest_shell < current_shell:
+            largest_shell = current_shell
+
+        mol_valence = mol_valence + Z - delta
+        nfrzn += delta
+
+    if mol_valence <= 0:
+        nfrzn -= period_to_full_shell(largest_shell - 1) - period_to_full_shell(largest_shell - 2)
+
+    return nfrzn
+
 def how_many_derivs(k,n):
     """How many unique Cartesian derivatives for k atoms at nth order"""
     val = 1

From ed76ab63a5fdd4f9166f1248d5c78d0083608abb Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 16 Feb 2024 14:27:17 -0500
Subject: [PATCH 51/91] Remove printing

---
 quax/methods/mp2f12.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 9bb85c0..5393067 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -87,10 +87,6 @@ def loop_energy(idx, f12_corr):
 
     E_s = cabs_singles(f, ndocc, nri)
 
-    print(E_mp2)
-    print(dE_mp2f12)
-    print(E_s)
-
     return E_mp2 + dE_mp2f12 + E_s
 
 # CABS Singles

From c610ff59e4905864248de202d1026b92e129ddbe Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 16 Feb 2024 14:42:54 -0500
Subject: [PATCH 52/91] Fix partial

---
 quax/core.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index de83ca6..7546592 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -155,26 +155,31 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
         # JAX will then collect the internal coordinate partial derivative instead. 
         if method == 'scf' or method == 'hf' or method == 'rhf':
             def partial_wrapper(*args):
+                geom = jnp.asarray(args)
                 E_scf = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path,\
                                                 options=options, deriv_order=deriv_order, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
             def partial_wrapper(*args):
+                geom = jnp.asarray(args)
                 E_mp2 = restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
                                        options=options, deriv_order=deriv_order)
                 return E_mp2
         elif method =='mp2-f12':
             def partial_wrapper(*args):
-                E_mp2f12 = restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
-                                              options=options, deriv_order=deriv_order)
+                geom = jnp.asarray(args)
+                E_mp2f12 = restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges,\
+                                               xyz_path, options=options, deriv_order=deriv_order)
                 return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
+                geom = jnp.asarray(args)
                 E_ccsd = rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
                                options=options, deriv_order=deriv_order)
                 return E_ccsd
         elif method =='ccsd(t)':
             def partial_wrapper(*args):
+                geom = jnp.asarray(args)
                 E_ccsd_t = rccsd_t(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
                                    options=options, deriv_order=deriv_order)
                 return E_ccsd_t

From b60a65a080ff0149cf9c1d947a52dc559c24844d Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 22 Mar 2024 13:12:54 -0400
Subject: [PATCH 53/91] JAX Loops for Iterations

---
 quax/methods/ccsd.py         | 26 ++++++-------
 quax/methods/hartree_fock.py | 74 +++++++++++++++++++-----------------
 2 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index fe0f32c..f018af4 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -31,22 +31,20 @@ def rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options
     d = 1.0 / (fock_Od.reshape(-1, 1) - fock_Vd)
 
     # Initial Amplitudes
-    T1 = jnp.zeros((ndocc - ncore, nvir))
-    T2 = D * V[2]
 
     maxit = options['maxit']
-    iteration = 0
-    E_ccsd = 1.0
-    E_old = 0.0
-    while abs(E_ccsd - E_old)  > 1e-9:
-        E_old = E_ccsd * 1
-
-        T1, T2 = rccsd_iter(T1, T2, V, d, D)
-        E_ccsd = rccsd_energy(T1, T2, V[2])
-
-        iteration += 1
-        if iteration == maxit:
-            break
+    def ccsd_procedure(arr):
+        iter, de_, T1_,T2_, e_old = arr
+
+        T1_,T2_ = rccsd_iter(T1_,T2_, V, d, D)
+        e_ccsd = rccsd_energy(T1_,T2_, V[2])
+
+        de_ = jax.lax.cond(iter + 1 == maxit, lambda: 1.e-12, lambda: e_ccsd - e_old)
+
+        return (iter + 1, de_, T1_, T2_, e_ccsd)
+
+    iteration, _, T1, T2, E_ccsd = jax.lax.while_loop(lambda arr: abs(arr[1])  > 1e-10, ccsd_procedure,
+                                   (0, 1.0, jnp.zeros((ndocc - ncore, nvir)), D * V[2], 0.0)) # (iter, dE, T1, T2, E_ccsd)
 
     print(iteration, " CCSD iterations performed")
     if return_aux_data:
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 9ff1696..703e17c 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -13,7 +13,6 @@ def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_pa
     damping = options['damping']
     damp_factor = options['damp_factor']
     spectral_shift = options['spectral_shift']
-    convergence = 1e-10
     ndocc = nelectrons // 2
 
     # If we are doing MP2 or CCSD after, might as well use jit-compiled JK-build, since HF will not be memory bottleneck
@@ -30,19 +29,19 @@ def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_pa
 
     # For slightly shifting eigenspectrum of transformed Fock for degenerate eigenvalues 
     # (JAX cannot differentiate degenerate eigenvalue eigh) 
-    if spectral_shift:
-        # Shifting eigenspectrum requires lower convergence.
-        convergence = 1e-8 
-        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * convergence
-        shift = jnp.diag(fudge)
-    else:
-        shift = jnp.zeros_like(S)
+    def form_shift():
+        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * 1e-8 
+        return jnp.diag(fudge)
+    
+    shift = jax.lax.cond(spectral_shift, lambda: form_shift(), lambda: jnp.zeros_like(S))
+
+    # Shifting eigenspectrum requires lower convergence.
+    convergence = jax.lax.cond(spectral_shift, lambda: 1.0e-9, lambda: 1.0e-10)
 
     H = T + V
     Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
-    D = jnp.zeros_like(H)
     
-    def rhf_iter(F,D):
+    def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
         Fp = A.T @ F @ A
         Fp = Fp + shift 
@@ -51,36 +50,41 @@ def rhf_iter(F,D):
         Cocc = C[:, :ndocc]
         D = Cocc @ Cocc.T
         return E_scf, D, C, eps
+    
+    def DIIS(F, D, S):
+        diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
+        diis_e = A @ diis_e @ A
+        return jnp.mean(diis_e ** 2) ** 0.5
+    
+    def scf_procedure(carry):
+        iter, de_, drms_, eps_, C_, D_old, D_, e_old = carry
 
-    iteration = 0
-    E_scf = 1.0
-    E_old = 0.0
-    Dold = jnp.zeros_like(D)
-    dRMS = 1.0
-
-    # Converge according to energy and DIIS residual to ensure eigenvalues and eigenvectors are maximally converged.
-    # This is crucial for numerical stability for higher order derivatives of correlated methods.
-    while ((abs(E_scf - E_old) > convergence) or (dRMS > convergence)):
-        E_old = E_scf * 1
-        if damping:
-            if iteration < 10:
-                D = Dold * damp_factor + D * damp_factor
-                Dold = D * 1.0
+        D_ = jax.lax.cond(damping and (iter < 10), lambda: D_old * damp_factor + D_ * damp_factor, lambda: D_)
+        D_old = jnp.copy(D_)
         # Build JK matrix: 2 * J - K
-        JK = 2 * jk_build(G, D)
-        JK -= jk_build(G.transpose((0,2,1,3)), D)
+        JK = 2 * jk_build(G, D_)
+        JK -= jk_build(G.transpose((0,2,1,3)), D_)
         # Build Fock
         F = H + JK
-        # Update convergence error
-        if iteration > 1:
-            diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
-            diis_e = A @ diis_e @ A
-            dRMS = jnp.mean(diis_e ** 2) ** 0.5
         # Compute energy, transform Fock and diagonalize, get new density
-        E_scf, D, C, eps = rhf_iter(F, D)
-        iteration += 1
-        if iteration == maxit:
-            break
+        e_scf, D_, C_, eps_ = rhf_iter(F, D_)
+
+        de_, drms_ = jax.lax.cond(iter + 1 == maxit, lambda: (1.e-15, 1.e-15), lambda: (e_old - e_scf, DIIS(F, D_, S)))
+
+        return (iter + 1, de_, drms_, eps_, C_, D_old, D_, e_scf)
+
+    # Create Guess Density
+    D = jnp.copy(H)
+    JK = 2 * jk_build(G, D)
+    JK -= jk_build(G.transpose((0,2,1,3)), D)
+    F = H + JK
+    E_init, D_init, C_init, eps_init = rhf_iter(F, D)
+
+    # Perform SCF Procedure
+    iteration, dE, dRMS, eps, C, _, D, E_scf = jax.lax.while_loop(lambda arr: (abs(arr[1]) > convergence) | (arr[2] > convergence),
+                                                           scf_procedure, (0, 1.0, 1.0, eps_init, C_init, D, D_init, E_init))
+                                                           # (iter, dE, dRMS, eps, C, D_old, D, E_scf)
+
     print(iteration, " RHF iterations performed")
 
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 

From f6f7a2ef82d9fbcabf50f3f564dec24adfe55856 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 22 Mar 2024 14:25:28 -0400
Subject: [PATCH 54/91] First commit

---
 quax/methods/hartree_fock.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 703e17c..44056d6 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -30,9 +30,9 @@ def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_pa
     # For slightly shifting eigenspectrum of transformed Fock for degenerate eigenvalues 
     # (JAX cannot differentiate degenerate eigenvalue eigh) 
     def form_shift():
-        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * 1e-8 
+        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * 1.e-8
         return jnp.diag(fudge)
-    
+
     shift = jax.lax.cond(spectral_shift, lambda: form_shift(), lambda: jnp.zeros_like(S))
 
     # Shifting eigenspectrum requires lower convergence.
@@ -50,12 +50,12 @@ def rhf_iter(F, D):
         Cocc = C[:, :ndocc]
         D = Cocc @ Cocc.T
         return E_scf, D, C, eps
-    
+
     def DIIS(F, D, S):
         diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
         diis_e = A @ diis_e @ A
         return jnp.mean(diis_e ** 2) ** 0.5
-    
+
     def scf_procedure(carry):
         iter, de_, drms_, eps_, C_, D_old, D_, e_old = carry
 

From 08af29320f29e20bd0bf05b0a90b131fcfa9854e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 22 Mar 2024 22:26:04 -0400
Subject: [PATCH 55/91] Dipole Ints working

---
 quax/integrals/libint_interface.cc | 97 ++++++++++++++++++++++++------
 quax/integrals/oei.py              | 12 ++++
 quax/methods/hartree_fock.py       | 10 +--
 quax/methods/ints.py               | 14 ++++-
 4 files changed, 108 insertions(+), 25 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 8c64568..60c3350 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -77,25 +77,6 @@ libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
     return cabs;
 }
 
-// Returns number of basis functions
-int nbf(std::string basis, std::string xyzfilename) {
-    libint2::initialize();
-    atoms = get_atoms(xyzfilename);
-
-    // Move harddrive load of basis and xyz to happen only once
-    libint2::BasisSet bs = libint2::BasisSet(basis, atoms);
-    bs.set_pure(false); // use cartesian gaussians
-    if (basis.find("-cabs", 10) != std::string::npos) {
-        bs = make_ao_cabs(basis, bs);
-    }
-
-    int nbf = static_cast<int>(bs.nbf());
-
-    libint2::finalize();
-
-    return nbf;
-}
-
 // Must call initialize before computing ints 
 void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
                 std::string basis3, std::string basis4, double ints_tol) {
@@ -485,6 +466,83 @@ py::array compute_1e_int(std::string type) {
     return py::array(result.size(), result.data()); 
 }
 
+// Compute one-electron dipole integrals
+std::vector<py::array> compute_dipole_ints() {
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
+    // Integral engine
+    std::vector<libint2::Engine> engines(nthreads);
+
+    // Multipole origin (named COM, but hard-coded to the coordinate origin)
+    std::array<double,3> COM = {0.000, 0.000, 0.000};
+
+    // Will compute overlap + electric dipole moments
+    engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l);
+    engines[0].set_params(COM); // with COM as the multipole origin
+    engines[0].set_precision(max_engine_precision);
+    engines[0].prescale_by(-2);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    size_t length = nbf1 * nbf2;
+    std::vector<double> Mu_X(length); // Mu_X Vector
+    std::vector<double> Mu_Y(length); // Mu_Y Vector
+    std::vector<double> Mu_Z(length); // Mu_Z Vector
+
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+        auto mu_x_shellset = buf_vec[1];
+        auto mu_y_shellset = buf_vec[2];
+        auto mu_z_shellset = buf_vec[3];
+
+        if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr)
+            continue;  // nullptr returned if the entire shell-set was screened out
+
+        // Loop over shell block, keeping a total count idx for the size of shell set
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] = mu_x_shellset[idx];
+                    Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] = mu_x_shellset[idx];
+                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] = mu_y_shellset[idx];
+                    Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] = mu_y_shellset[idx];
+                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] = mu_z_shellset[idx];
+                    Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] = mu_z_shellset[idx];
+                }
+            }
+        } else {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] = mu_x_shellset[idx];
+                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] = mu_y_shellset[idx];
+                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] = mu_z_shellset[idx];
+                }
+            }
+        }
+    }
+    return {py::array(Mu_X.size(), Mu_X.data()), py::array(Mu_Y.size(), Mu_Y.data()),
+            py::array(Mu_Z.size(), Mu_Z.data())};
+}
+
 // Computes two-electron integrals
 py::array compute_2e_int(std::string type, double beta) {
     // Shell screening
@@ -2270,6 +2328,7 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("initialize", &initialize, "Initializes libint, builds geom and basis, assigns globals");
     m.def("finalize", &finalize, "Kills libint");
     m.def("compute_1e_int", &compute_1e_int, "Computes one-electron integrals with libint");
+    m.def("compute_dipole_ints", &compute_dipole_ints, "Computes electric (Cartesian) dipole integrals");
     m.def("compute_2e_int", &compute_2e_int, "Computes two-electron integrals with libint");
     m.def("compute_1e_deriv", &compute_1e_deriv, "Computes one-electron integral nuclear derivatives with libint");
     m.def("compute_2e_deriv", &compute_2e_deriv, "Computes two-electron integral nuclear derivatives with libint");
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 608f275..5ace737 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -47,6 +47,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.kinetic_deriv_p = jax.core.Primitive("kinetic_deriv")
         self.potential_p = jax.core.Primitive("potential")
         self.potential_deriv_p = jax.core.Primitive("potential_deriv")
+        self.dipole_p = jax.core.Primitive("dipole")
 
         # Register primitive evaluation rules
         self.overlap_p.def_impl(self.overlap_impl)
@@ -55,6 +56,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.kinetic_deriv_p.def_impl(self.kinetic_deriv_impl)
         self.potential_p.def_impl(self.potential_impl)
         self.potential_deriv_p.def_impl(self.potential_deriv_impl)
+        self.dipole_p.def_impl(self.dipole_impl)
 
         # Register the JVP rules with JAX
         jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
@@ -88,6 +90,9 @@ def potential(self, geom):
     def potential_deriv(self, geom, deriv_vec):
         return self.potential_deriv_p.bind(geom, deriv_vec)
 
+    def dipole(self, geom):
+        return self.dipole_p.bind(geom)
+
     # Create primitive evaluation rules
     def overlap_impl(self, geom):
         S = libint_interface.compute_1e_int("overlap")
@@ -104,6 +109,13 @@ def potential_impl(self, geom):
         V = V.reshape(self.nbf1, self.nbf2)
         return jnp.asarray(V)
 
+    def dipole_impl(self, geom):
+        Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_ints()
+        Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
+        Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
+        Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
+        return jnp.asarray(Mu_X), jnp.asarray(Mu_Y), jnp.asarray(Mu_Z)
+
     def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
         deriv_order = np.sum(deriv_vec)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 44056d6..38a58a3 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -30,7 +30,7 @@ def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_pa
     # For slightly shifting eigenspectrum of transformed Fock for degenerate eigenvalues 
     # (JAX cannot differentiate degenerate eigenvalue eigh) 
     def form_shift():
-        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * 1.e-8
+        fudge = jnp.asarray(jnp.linspace(0, 1, nbf)) * 1.e-9
         return jnp.diag(fudge)
 
     shift = jax.lax.cond(spectral_shift, lambda: form_shift(), lambda: jnp.zeros_like(S))
@@ -81,10 +81,9 @@ def scf_procedure(carry):
     E_init, D_init, C_init, eps_init = rhf_iter(F, D)
 
     # Perform SCF Procedure
-    iteration, dE, dRMS, eps, C, _, D, E_scf = jax.lax.while_loop(lambda arr: (abs(arr[1]) > convergence) | (arr[2] > convergence),
-                                                           scf_procedure, (0, 1.0, 1.0, eps_init, C_init, D, D_init, E_init))
-                                                           # (iter, dE, dRMS, eps, C, D_old, D, E_scf)
-
+    iteration, _, _, eps, C, _, D, E_scf = jax.lax.while_loop(lambda arr: (abs(arr[1]) > convergence) | (arr[2] > convergence),
+                                                              scf_procedure, (0, 1.0, 1.0, eps_init, C_init, D, D_init, E_init))
+                                                              # (iter, dE, dRMS, eps, C, D_old, D, E_scf)
     print(iteration, " RHF iterations performed")
 
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 
@@ -92,6 +91,7 @@ def scf_procedure(carry):
     ndegen_orbs =  tmp.shape[0] - jnp.unique(tmp).shape[0] 
     if (ndegen_orbs / nbf) > 0.20:
         print("Hartree-Fock warning: More than 20% of orbitals have degeneracies. Higher order derivatives may be unstable due to eigendecomposition AD rule")
+
     if not return_aux_data:
         return E_scf
     else:
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index f137153..70dba83 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -14,7 +14,7 @@
      
 
 def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
-    # Load integral algo, decides to compute integrals in memory or use disk 
+    # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
     basis_name = basis_set.name()
     libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
@@ -56,6 +56,18 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
     libint_interface.finalize()
     return S, T, V, G
 
+def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
+    # Load integral algo, decides to compute integrals in memory or use disk
+    algo = options['integral_algo']
+    basis_name = basis_set.name()
+    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
+
+    oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
+
+    Mu_X, Mu_Y, Mu_Z = oei_obj.dipole(geom)
+
+    return (Mu_X, Mu_Y, Mu_Z)
+
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']

From bca4bc0954a6862f4120771a0990d204eb8ca07c Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 28 Mar 2024 12:57:35 -0400
Subject: [PATCH 56/91] Derivative dipole ints

---
 quax/integrals/libint_interface.cc | 141 ++++++++++++++++++++++++++++-
 quax/integrals/oei.py              |  81 ++++++++++++++++-
 quax/methods/ints.py               |   4 +-
 3 files changed, 222 insertions(+), 4 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 60c3350..4f83cac 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -851,6 +851,143 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
     return py::array(result.size(), result.data()); 
 }
 
+// Computes nuclear derivatives of dipole integrals
+std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    // Get order of differentiation
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index)
+    // to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
+    // Integral engine
+    std::vector<libint2::Engine> engines(nthreads);
+
+    // Multipole origin; despite the name COM it is fixed at the coordinate origin (0, 0, 0)
+    std::array<double,3> COM = {0.000, 0.000, 0.000};
+
+    // Will compute overlap + electric dipole moments
+    engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l, deriv_order);
+    engines[0].set_params(COM); // with COM as the multipole origin
+    engines[0].set_precision(max_engine_precision);
+    engines[0].prescale_by(-1);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    size_t length = nbf1 * nbf2;
+    std::vector<double> Mu_X(length); // Mu_X Vector
+    std::vector<double> Mu_Y(length); // Mu_Y Vector
+    std::vector<double> Mu_Z(length); // Mu_Z Vector
+
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+
+        // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+        std::vector<long> shell_atom_index_list{atom1, atom2};
+
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+
+        // For every desired atom derivative, check shell and nuclear indices for a match,
+        // add it to subvector for that derivative
+        // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+        for (int j = 0; j < desired_atom_indices.size(); j++){
+            int desired_atom_idx = desired_atom_indices[j];
+            // Shell indices
+            for (int i = 0; i < 2; i++){
+                int atom_idx = shell_atom_index_list[i];
+                if (atom_idx == desired_atom_idx) {
+                    int tmp = 3 * i + desired_coordinates[j];
+                    indices[j].push_back(tmp);
+                    continue; // Avoid adding same atom and coord twice
+                }
+            }
+        }
+
+        // Now indices is a vector of vectors, where each subvector is your choices
+        // for the first derivative operator, second, third, etc
+        // and the total number of subvectors is the order of differentiation
+        // Now we want all combinations where we pick exactly one index from each subvector.
+        // This is achievable through a cartesian product
+        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+        std::vector<int> buffer_indices;
+
+        // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+        for (auto vec : index_combos)  {
+            std::sort(vec.begin(), vec.end());
+            int buf_idx = 0;
+            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+            buffer_indices.push_back(buf_idx * 4);
+        }
+
+        // Loop over every buffer index and accumulate for every shell set.
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto mu_x_shellset = buf_vec[buffer_indices[i] + 1];
+                auto mu_y_shellset = buf_vec[buffer_indices[i] + 2];
+                auto mu_z_shellset = buf_vec[buffer_indices[i] + 3];
+                if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr)
+                    continue;  // nullptr returned if the entire shell-set was screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] += mu_x_shellset[idx];
+                        Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] += mu_x_shellset[idx];
+                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] += mu_y_shellset[idx];
+                        Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] += mu_y_shellset[idx];
+                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] += mu_z_shellset[idx];
+                        Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] += mu_z_shellset[idx];
+                    }
+                }
+            }
+        } else {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto mu_x_shellset = buf_vec[buffer_indices[i] + 1];
+                auto mu_y_shellset = buf_vec[buffer_indices[i] + 2];
+                auto mu_z_shellset = buf_vec[buffer_indices[i] + 3];
+                if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr)
+                    continue;  // nullptr returned if the entire shell-set was screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] += mu_x_shellset[idx];
+                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] += mu_y_shellset[idx];
+                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] += mu_z_shellset[idx];
+                    }
+                }
+            }
+        }
+    }
+    return {py::array(Mu_X.size(), Mu_X.data()), py::array(Mu_Y.size(), Mu_Y.data()),
+            py::array(Mu_Z.size(), Mu_Z.data())};
+}
+
 // Computes nuclear derivatives of two-electron integrals
 py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
@@ -2326,11 +2463,13 @@ py::array eri_deriv_core(int deriv_order) {
 PYBIND11_MODULE(libint_interface, m) {
     m.doc() = "pybind11 libint interface to molecular integrals"; // optional module docstring
     m.def("initialize", &initialize, "Initializes libint, builds geom and basis, assigns globals");
+    m.def("generate_multi_index_lookup", &generate_multi_index_lookup, "Flattened upper triangular map");
     m.def("finalize", &finalize, "Kills libint");
     m.def("compute_1e_int", &compute_1e_int, "Computes one-electron integrals with libint");
-    m.def("compute_dipole_ints", &compute_dipole_ints, "Computes electric (Cartesian) dipole integrals");
+    m.def("compute_dipole_ints", &compute_dipole_ints, "Computes electric (Cartesian) dipole integrals with libint");
     m.def("compute_2e_int", &compute_2e_int, "Computes two-electron integrals with libint");
     m.def("compute_1e_deriv", &compute_1e_deriv, "Computes one-electron integral nuclear derivatives with libint");
+    m.def("compute_dipole_derivs", &compute_dipole_derivs, "Computes electric (Cartesian) dipole nuclear integrals with libint");
     m.def("compute_2e_deriv", &compute_2e_deriv, "Computes two-electron integral nuclear derivatives with libint");
     m.def("compute_1e_deriv_disk", &compute_1e_deriv_disk, "Computes one-electron nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("compute_2e_deriv_disk", &compute_2e_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 5ace737..2679ab3 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -48,6 +48,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.potential_p = jax.core.Primitive("potential")
         self.potential_deriv_p = jax.core.Primitive("potential_deriv")
         self.dipole_p = jax.core.Primitive("dipole")
+        self.dipole_deriv_p = jax.core.Primitive("dipole_deriv")
 
         # Register primitive evaluation rules
         self.overlap_p.def_impl(self.overlap_impl)
@@ -57,6 +58,7 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.potential_p.def_impl(self.potential_impl)
         self.potential_deriv_p.def_impl(self.potential_deriv_impl)
         self.dipole_p.def_impl(self.dipole_impl)
+        self.dipole_deriv_p.def_impl(self.dipole_deriv_impl)
 
         # Register the JVP rules with JAX
         jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
@@ -65,11 +67,14 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         jax.interpreters.ad.primitive_jvps[self.kinetic_deriv_p] = self.kinetic_deriv_jvp
         jax.interpreters.ad.primitive_jvps[self.potential_p] = self.potential_jvp
         jax.interpreters.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.dipole_p] = self.dipole_jvp
+        jax.interpreters.ad.primitive_jvps[self.dipole_deriv_p] = self.dipole_deriv_jvp
 
         # Register the batching rules with JAX
         jax.interpreters.batching.primitive_batchers[self.overlap_deriv_p] = self.overlap_deriv_batch
         jax.interpreters.batching.primitive_batchers[self.kinetic_deriv_p] = self.kinetic_deriv_batch
         jax.interpreters.batching.primitive_batchers[self.potential_deriv_p] = self.potential_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.dipole_deriv_p] = self.dipole_deriv_batch
 
     # Create functions to call primitives
     def overlap(self, geom):
@@ -93,6 +98,9 @@ def potential_deriv(self, geom, deriv_vec):
     def dipole(self, geom):
         return self.dipole_p.bind(geom)
 
+    def dipole_deriv(self, geom, deriv_vec):
+        return self.dipole_deriv_p.bind(geom, deriv_vec)
+
     # Create primitive evaluation rules
     def overlap_impl(self, geom):
         S = libint_interface.compute_1e_int("overlap")
@@ -114,7 +122,7 @@ def dipole_impl(self, geom):
         Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
         Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
         Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
-        return jnp.asarray(Mu_X), jnp.asarray(Mu_Y), jnp.asarray(Mu_Z)
+        return jnp.stack([Mu_X, Mu_Y, Mu_Z])
 
     def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
@@ -212,6 +220,52 @@ def potential_deriv_impl(self, geom, deriv_vec):
                     raise Exception("Something went wrong reading integral derivative file")
             return jnp.asarray(V)
 
+    def dipole_deriv_impl(self, geom, deriv_vec):
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)
+        idx = get_deriv_vec_idx(deriv_vec)
+
+        if self.mode == 'dipole':
+            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(deriv_vec)
+            Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
+            Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
+            Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
+            return jnp.stack([Mu_X, Mu_Y, Mu_Z])
+        elif self.mode == 'disk':
+            if os.path.exists("dipole_derivs.h5"):
+                file_name = "dipole_derivs.h5"
+                dataset1_name = "mu_x_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset2_name = "mu_y_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset3_name = "mu_z_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+            elif os.path.exists("dipole_partials.h5"):
+                file_name = "dipole_partials.h5"
+                dataset1_name = "mu_x_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset2_name = "mu_y_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset3_name = "mu_z_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("Something went wrong reading integral derivative file")
+            with h5py.File(file_name, 'r') as f:
+                mu_x_set = f[dataset1_name]
+                mu_y_set = f[dataset2_name]
+                mu_z_set = f[dataset3_name]
+                if len(mu_x_set.shape) == 3:
+                    Mu_X = mu_x_set[:,:,idx]
+                    Mu_Y = mu_y_set[:,:,idx]
+                    Mu_Z = mu_z_set[:,:,idx]
+                elif len(mu_x_set.shape) == 2:
+                    Mu_X = mu_x_set[:,:]
+                    Mu_Y = mu_y_set[:,:]
+                    Mu_Z = mu_z_set[:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.stack([Mu_X, Mu_Y, Mu_Z])
+
     def overlap_jvp(self, primals, tangents):
         geom, = primals
         primals_out = self.overlap(geom)
@@ -248,6 +302,18 @@ def potential_deriv_jvp(self, primals, tangents):
         tangents_out = self.potential_deriv(geom, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
+    def dipole_jvp(self, primals, tangents):
+        geom, = primals
+        primals_out = self.dipole(geom)
+        tangents_out = self.dipole_deriv(geom, tangents[0])
+        return primals_out, tangents_out
+
+    def dipole_deriv_jvp(self, primals, tangents):
+        geom, deriv_vec = primals
+        primals_out = self.dipole_deriv(geom, deriv_vec)
+        tangents_out = self.dipole_deriv(geom, deriv_vec + tangents[0])
+        return primals_out, tangents_out
+
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP's
     # of each oei function
     # When the input argument of deriv_batch is batched along the 0'th axis
@@ -286,3 +352,16 @@ def potential_deriv_batch(self, batched_args, batch_dims):
         results = jnp.concatenate(results, axis=0)
         return results, 0
 
+    def dipole_deriv_batch(self, batched_args, batch_dims):
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims
+        results = []
+        for i in deriv_batch:
+            tmp1, tmp2, tmp3 = self.dipole_deriv(geom_batch, i)
+            mu_x = jnp.expand_dims(tmp1, axis=0)
+            mu_y = jnp.expand_dims(tmp2, axis=0)
+            mu_z = jnp.expand_dims(tmp3, axis=0)
+            results.append(jnp.stack([mu_x, mu_y, mu_z], axis=1))
+        results = jnp.concatenate(results, axis=0)
+        return results, 0
+
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 70dba83..7a3722a 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -64,9 +64,9 @@ def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
 
     oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
 
-    Mu_X, Mu_Y, Mu_Z = oei_obj.dipole(geom)
+    Mu_ = oei_obj.dipole(geom)
 
-    return (Mu_X, Mu_Y, Mu_Z)
+    return Mu_[0], Mu_[1], Mu_[2]
 
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
     # Load integral algo, decides to compute integrals in memory or use disk

From 17072218f3ecc444d237d96e9b88e81e5356b593 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 29 Mar 2024 17:25:51 -0400
Subject: [PATCH 57/91] HF Dipole Moments

---
 quax/core.py                       | 24 ++++++++++++++++++------
 quax/integrals/libint_interface.cc |  2 +-
 quax/methods/hartree_fock.py       | 16 ++++++++++++++--
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 7546592..f49b0f7 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -40,7 +40,8 @@ def check_options(options):
                        'integral_algo': 'libint_core',
                        'ints_tolerance': 1.0e-14,
                        'freeze_core': False,
-                       'beta': 1.0
+                       'beta': 1.0,
+                       'dipole': False
                       }
 
     for key in options.keys():
@@ -54,7 +55,7 @@ def check_options(options):
             print("{} keyword option not recognized.".format(key))
     return keyword_options
 
-def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=None):
+def compute(molecule, basis_name, method, electric_field=None, options=None, deriv_order=0, partial=None):
     """
     General function for computing energies, derivatives, and partial derivatives.
     """
@@ -84,16 +85,22 @@ def compute(molecule, basis_name, method, options=None, deriv_order=0, partial=N
 
     basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
     nbf = basis_set.nbf()
+    print("Basis name: ", basis_set.name())
     print("Number of basis functions: ", nbf)
 
     if 'f12' in method:
         cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
 
+    if options['dipole'] and type(electric_field) == type(None):
+        raise Exception("Electric field must be given for dipole computation.")
+
     # Energy and full derivative tensor evaluations
     if not partial:
         # Create energy evaluation function
         if method == 'scf' or method == 'hf' or method == 'rhf':
             args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+            if options['dipole']:
+                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2':
@@ -134,6 +141,11 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
         else:
             print("Error: Order {} derivatives are not exposed to the API.".format(deriv_order))
             deriv = 0
+
+        if options['dipole']:
+            dip_nuc = jnp.einsum('q,qx', nuclear_charges, geom.reshape(-1,3))
+            deriv += dip_nuc
+
         return np.asarray(deriv)
 
     # Partial derivatives
@@ -209,7 +221,7 @@ def partial_wrapper(*args):
             partial_deriv = 0
         return jnp.round(partial_deriv, 10)
 
-def energy(molecule, basis_name, method, options=None):
+def energy(molecule, basis_name, method, electric_field=None, options=None):
     """
     Call an energy method on a molecule and basis set.
 
@@ -251,10 +263,10 @@ def energy(molecule, basis_name, method, options=None):
     -------
     The electronic energy in a.u. (Hartrees)
     """
-    E = compute(molecule, basis_name, method, options)
+    E = compute(molecule, basis_name, method, electric_field, options)
     return E
 
-def derivative(molecule, basis_name, method, deriv_order, options=None):
+def derivative(molecule, basis_name, method, electric_field=None, deriv_order=1, options=None):
     """
     Compute the full Cartesian derivative tensor for a particular energy method, molecule, and basis set. 
 
@@ -300,7 +312,7 @@ def derivative(molecule, basis_name, method, deriv_order, options=None):
     deriv : float
         The requested derivative tensor, elements have units of Hartree/bohr^(n)
     """
-    deriv = compute(molecule, basis_name, method, options, deriv_order)
+    deriv = compute(molecule, basis_name, method, electric_field, options, deriv_order)
     return deriv
 
 def partial_derivative(molecule, basis_name, method, deriv_order, partial, options=None):
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 4f83cac..cdb3214 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -482,7 +482,7 @@ std::vector<py::array> compute_dipole_ints() {
     engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l);
     engines[0].set_params(COM); // with COM as the multipole origin
     engines[0].set_precision(max_engine_precision);
-    engines[0].prescale_by(-2);
+    engines[0].prescale_by(-1);
     for (size_t i = 1; i != nthreads; ++i) {
         engines[i] = engines[0];
     }
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 38a58a3..ac3bf79 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -3,10 +3,17 @@
 import jax.numpy as jnp
 import psi4
 
-from .ints import compute_integrals
+from .ints import compute_integrals, compute_dipole_ints
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
-def restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
+def restricted_hartree_fock(*args, options, deriv_order=0, return_aux_data=False):
+    if options['dipole']:
+        electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
+        deriv_order = 0
+        print("Deriv_Order for Integrals: ", deriv_order)
+    else:
+        geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
+
     print("Running Hartree-Fock Computation...")
     # Load keyword options
     maxit = options['maxit']
@@ -40,6 +47,11 @@ def form_shift():
 
     H = T + V
     Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
+
+    if options['dipole']:
+        Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, 0, options)
+        val = jnp.einsum('x,xij->ij', electric_field, Mu_XYZ)
+        H += val
     
     def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc

From cd5cec999d329f9c267827e450184056e09f9e4a Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 1 Apr 2024 15:51:00 -0400
Subject: [PATCH 58/91] Dipole for all methods

---
 quax/core.py                 | 10 ++++++++++
 quax/methods/ccsd.py         | 13 +++++++++++--
 quax/methods/ccsd_t.py       | 12 ++++++++++--
 quax/methods/hartree_fock.py |  1 -
 quax/methods/mp2.py          | 15 +++++++++++++--
 quax/methods/mp2f12.py       | 12 ++++++++++--
 6 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index f49b0f7..16e7024 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -105,18 +105,26 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            if options['dipole']:
+                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2-f12':
             args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            if options['dipole']:
+                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            if options['dipole']:
+                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd(t)':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+            if options['dipole']:
+                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd_t(*args, options=options, deriv_order=deriv_order)
         else:
@@ -143,7 +151,9 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
             deriv = 0
 
         if options['dipole']:
+            print("Electric Dipole: ", deriv.reshape(-1, 3))
             dip_nuc = jnp.einsum('q,qx', nuclear_charges, geom.reshape(-1,3))
+            print("Nuclear Dipole: ", dip_nuc.reshape(-1, 3))
             deriv += dip_nuc
 
         return np.asarray(deriv)
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index f018af4..629f42b 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -6,10 +6,19 @@
 from .energy_utils import tei_transformation
 from .hartree_fock import restricted_hartree_fock
 
-def rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
+def rccsd(*args, options, deriv_order=0, return_aux_data=False):
+    if options['dipole']:
+        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        deriv_order = 0
+        scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    else:
+        geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+
+    # Load keywords
     ndocc = nelectrons // 2
     ncore = nfrzn // 2
-    E_scf, C, eps, V = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
+    E_scf, C, eps, V = restricted_hartree_fock(*scf_args, options=options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running CCSD Computation...")
     nbf = V.shape[0]
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 3bb89a8..8454be6 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -93,8 +93,16 @@ def loop_k(arr2):
     i, j, k, pT = while_loop(lambda arr0: arr0[0] < o, loop_i, (0, 0, 0, 0.0)) # (i, j, k, pT)
     return pT
 
-def rccsd_t(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0):
-    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
+def rccsd_t(*args, options, deriv_order=0):
+    if options['dipole']:
+        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        deriv_order = 0
+        ccsd_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+    else:
+        geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        ccsd_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+
+    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(*ccsd_args, options=options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running (T) Correction...")
     pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index ac3bf79..747a9f7 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -10,7 +10,6 @@ def restricted_hartree_fock(*args, options, deriv_order=0, return_aux_data=False
     if options['dipole']:
         electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
         deriv_order = 0
-        print("Deriv_Order for Integrals: ", deriv_order)
     else:
         geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
 
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 88f8548..7f4e26e 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -7,10 +7,21 @@
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
 
-def restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0, return_aux_data=False):
+def restricted_mp2(*args, options, deriv_order=0, return_aux_data=False):
+    if options['dipole']:
+        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        deriv_order = 0
+        print("Deriv_Order for Integrals: ", deriv_order)
+        scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    else:
+        geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+
+    E_scf, C, eps, G = restricted_hartree_fock(*scf_args, options=options, deriv_order=deriv_order, return_aux_data=True)
+
+    # Load keyword options
     ndocc = nelectrons // 2
     ncore = nfrzn // 2
-    E_scf, C, eps, G = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path, options, deriv_order=deriv_order, return_aux_data=True)
 
     print("Running MP2 Computation...")
     nvirt = G.shape[0] - ndocc
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 5393067..a3f9b31 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -11,8 +11,16 @@
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .mp2 import restricted_mp2
 
-def restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order=0):
-    E_mp2, C_obs, eps = restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path, options, deriv_order, return_aux_data=True)
+def restricted_mp2_f12(*args, options, deriv_order=0):
+    if options['dipole']:
+        electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        deriv_order = 0
+        mp2_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+    else:
+        geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        mp2_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+
+    E_mp2, C_obs, eps = restricted_mp2(*mp2_args, options=options, deriv_order=deriv_order, return_aux_data=True)
     ndocc = nelectrons // 2
     ncore = nfrzn // 2
     eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]

From 69f2a6f01077b4c16116bdc4b452893bac7f2bd9 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Tue, 2 Apr 2024 17:42:55 -0400
Subject: [PATCH 59/91] IR Intensities WIP

---
 quax/core.py                 | 89 ++++++++++++++++++++++--------------
 quax/methods/ccsd.py         |  3 +-
 quax/methods/ccsd_t.py       |  3 +-
 quax/methods/hartree_fock.py |  7 ++-
 quax/methods/ints.py         |  2 +-
 quax/methods/mp2.py          |  4 +-
 quax/methods/mp2f12.py       |  3 +-
 7 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 16e7024..c4a089d 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -41,7 +41,7 @@ def check_options(options):
                        'ints_tolerance': 1.0e-14,
                        'freeze_core': False,
                        'beta': 1.0,
-                       'dipole': False
+                       'electric_field': False
                       }
 
     for key in options.keys():
@@ -73,7 +73,6 @@ def compute(molecule, basis_name, method, electric_field=None, options=None, der
     geom2d = np.asarray(molecule.geometry())
     geom_list = geom2d.reshape(-1).tolist()
     geom = jnp.asarray(geom2d.flatten())
-    dim = geom.reshape(-1).shape[0]
     xyz_file_name = "geom.xyz"
     molecule.save_xyz_file(xyz_file_name, True)
     xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
@@ -91,7 +90,7 @@ def compute(molecule, basis_name, method, electric_field=None, options=None, der
     if 'f12' in method:
         cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
 
-    if options['dipole'] and type(electric_field) == type(None):
+    if options['electric_field'] and type(electric_field) == type(None):
         raise Exception("Electric field must be given for dipole computation.")
 
     # Energy and full derivative tensor evaluations
@@ -99,31 +98,31 @@ def compute(molecule, basis_name, method, electric_field=None, options=None, der
         # Create energy evaluation function
         if method == 'scf' or method == 'hf' or method == 'rhf':
             args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
-            if options['dipole']:
+            if options['electric_field']:
                 args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['dipole']:
+            if options['electric_field']:
                 args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2-f12':
             args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['dipole']:
+            if options['electric_field']:
                 args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['dipole']:
+            if options['electric_field']:
                 args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd(t)':
             args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['dipole']:
+            if options['electric_field']:
                 args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd_t(*args, options=options, deriv_order=deriv_order)
@@ -139,18 +138,18 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
             deriv = jnp.round(grad, 10)
         elif deriv_order == 2:
             hess = jacfwd(jacfwd(electronic_energy, 0))(*args)
-            deriv = jnp.round(hess.reshape(dim,dim), 10)
+            deriv = jnp.round(hess, 10)
         elif deriv_order == 3:
             cubic = jacfwd(jacfwd(jacfwd(electronic_energy, 0)))(*args)
-            deriv = jnp.round(cubic.reshape(dim,dim,dim), 10)
+            deriv = jnp.round(cubic, 10)
         elif deriv_order == 4:
             quartic = jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, 0))))(*args)
-            deriv = jnp.round(quartic.reshape(dim,dim,dim,dim), 10)
+            deriv = jnp.round(quartic, 10)
         else:
             print("Error: Order {} derivatives are not exposed to the API.".format(deriv_order))
             deriv = 0
 
-        if options['dipole']:
+        if options['electric_field'] and deriv_order == 1:
             print("Electric Dipole: ", deriv.reshape(-1, 3))
             dip_nuc = jnp.einsum('q,qx', nuclear_charges, geom.reshape(-1,3))
             print("Nuclear Dipole: ", dip_nuc.reshape(-1, 3))
@@ -177,55 +176,75 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
         # JAX will then collect the internal coordinate partial derivative instead. 
         if method == 'scf' or method == 'hf' or method == 'rhf':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_scf = restricted_hartree_fock(geom, basis_set, nelectrons, nuclear_charges, xyz_path,\
-                                                options=options, deriv_order=deriv_order, return_aux_data=False)
+                if options['electric_field']:
+                    method_args = args + (basis_set, nelectrons, nuclear_charges, xyz_path)
+                else:
+                    geom = jnp.asarray(args)
+                    method_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+                E_scf = restricted_hartree_fock(*method_args, options=options, deriv_order=deriv_order, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_mp2 = restricted_mp2(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
-                                       options=options, deriv_order=deriv_order)
+                if options['electric_field']:
+                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                else:
+                    geom = jnp.asarray(args)
+                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                E_mp2 = restricted_mp2(*method_args, options=options, deriv_order=deriv_order)
                 return E_mp2
         elif method =='mp2-f12':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_mp2f12 = restricted_mp2_f12(geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges,\
-                                               xyz_path, options=options, deriv_order=deriv_order)
+                if options['electric_field']:
+                    method_args = args + (basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                else:
+                    geom = jnp.asarray(args)
+                    method_args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                E_mp2f12 = restricted_mp2_f12(*method_args, options=options, deriv_order=deriv_order)
                 return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_ccsd = rccsd(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
-                               options=options, deriv_order=deriv_order)
+                if options['electric_field']:
+                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                else:
+                    geom = jnp.asarray(args)
+                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                E_ccsd = rccsd(*method_args, options=options, deriv_order=deriv_order)
                 return E_ccsd
         elif method =='ccsd(t)':
             def partial_wrapper(*args):
-                geom = jnp.asarray(args)
-                E_ccsd_t = rccsd_t(geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path,\
-                                   options=options, deriv_order=deriv_order)
+                if options['electric_field']:
+                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                else:
+                    geom = jnp.asarray(args)
+                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+                E_ccsd_t = rccsd_t(*method_args, options=options, deriv_order=deriv_order)
                 return E_ccsd_t
         else:
             raise Exception("Error: Method {} not supported.".format(method))
+        
+        if options['electric_field']:
+            params = (electric_field, geom)
+        else:
+            params = geom_list
 
         if deriv_order == 1:
             i = partial[0]
-            partial_deriv = jacfwd(partial_wrapper, i)(*geom_list)
+            partial_deriv = jacfwd(partial_wrapper, i)(*params)
         elif deriv_order == 2:
             i,j = partial[0], partial[1]
-            partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*geom_list)
+            partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*params)
         elif deriv_order == 3:
             i,j,k = partial[0], partial[1], partial[2]
-            partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*geom_list)
+            partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*params)
         elif deriv_order == 4:
             i,j,k,l = partial[0], partial[1], partial[2], partial[3]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*geom_list)
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*params)
         elif deriv_order == 5:
             i,j,k,l,m = partial[0], partial[1], partial[2], partial[3], partial[4]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*geom_list)
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*params)
         elif deriv_order == 6:
             i,j,k,l,m,n = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*geom_list)
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*params)
         else:
             print("Error: Order {} partial derivatives are not exposed to the API.".format(deriv_order))
             partial_deriv = 0
@@ -325,7 +344,7 @@ def derivative(molecule, basis_name, method, electric_field=None, deriv_order=1,
     deriv = compute(molecule, basis_name, method, electric_field, options, deriv_order)
     return deriv
 
-def partial_derivative(molecule, basis_name, method, deriv_order, partial, options=None):
+def partial_derivative(molecule, basis_name, method, electric_field=None, deriv_order=0, partial=None, options=None):
     """
     Computes one particular nth-order partial derivative of the energy of an electronic structure method
     w.r.t. a set of cartesian coordinates. If you have N cartesian coordinates in your molecule, the nuclear derivative tensor
@@ -388,6 +407,6 @@ def partial_derivative(molecule, basis_name, method, deriv_order, partial, optio
     partial_deriv : float
         The requested partial derivative of the energy in units of Hartree/bohr^(n)
     """
-    partial_deriv = compute(molecule, basis_name, method, options, deriv_order, partial)
+    partial_deriv = compute(molecule, basis_name, method, electric_field, options, deriv_order, partial)
     return partial_deriv
 
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 629f42b..91a1697 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -7,9 +7,8 @@
 from .hartree_fock import restricted_hartree_fock
 
 def rccsd(*args, options, deriv_order=0, return_aux_data=False):
-    if options['dipole']:
+    if options['electric_field']:
         electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        deriv_order = 0
         scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
     else:
         geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 8454be6..5cf5c51 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -94,9 +94,8 @@ def loop_k(arr2):
     return pT
 
 def rccsd_t(*args, options, deriv_order=0):
-    if options['dipole']:
+    if options['electric_field']:
         electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        deriv_order = 0
         ccsd_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
     else:
         geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 747a9f7..f1512c8 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -7,9 +7,8 @@
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
 def restricted_hartree_fock(*args, options, deriv_order=0, return_aux_data=False):
-    if options['dipole']:
+    if options['electric_field']:
         electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
-        deriv_order = 0
     else:
         geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
 
@@ -47,8 +46,8 @@ def form_shift():
     H = T + V
     Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
 
-    if options['dipole']:
-        Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, 0, options)
+    if options['electric_field']:
+        Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options)
         val = jnp.einsum('x,xij->ij', electric_field, Mu_XYZ)
         H += val
     
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 7a3722a..670a25f 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -66,7 +66,7 @@ def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
 
     Mu_ = oei_obj.dipole(geom)
 
-    return Mu_[0], Mu_[1], Mu_[2]
+    return Mu_
 
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
     # Load integral algo, decides to compute integrals in memory or use disk
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 7f4e26e..338a23b 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -8,10 +8,8 @@
 from .hartree_fock import restricted_hartree_fock
 
 def restricted_mp2(*args, options, deriv_order=0, return_aux_data=False):
-    if options['dipole']:
+    if options['electric_field']:
         electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        deriv_order = 0
-        print("Deriv_Order for Integrals: ", deriv_order)
         scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
     else:
         geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index a3f9b31..5101d1e 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -12,9 +12,8 @@
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(*args, options, deriv_order=0):
-    if options['dipole']:
+    if options['electric_field']:
         electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        deriv_order = 0
         mp2_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
     else:
         geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args

From 3c6bacd4c14bdd8081d02776bbccc95db3336e8e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 4 Apr 2024 12:12:30 -0400
Subject: [PATCH 60/91] IR Intensities

---
 quax/core.py                 | 33 +++++++++------------------------
 quax/methods/hartree_fock.py |  6 ++++--
 2 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index c4a089d..4179b57 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -6,7 +6,6 @@
 import psi4
 import numpy as np
 import os
-import h5py
 
 from .methods.basis_utils import build_RIBS
 from .methods.hartree_fock import restricted_hartree_fock
@@ -14,7 +13,7 @@
 from .methods.mp2f12 import restricted_mp2_f12
 from .methods.ccsd import rccsd
 from .methods.ccsd_t import rccsd_t
-from .utils import get_required_deriv_vecs, n_frozen_core
+from .utils import n_frozen_core
 
 psi4.core.be_quiet()
 
@@ -148,27 +147,13 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order):
         else:
             print("Error: Order {} derivatives are not exposed to the API.".format(deriv_order))
             deriv = 0
-
-        if options['electric_field'] and deriv_order == 1:
-            print("Electric Dipole: ", deriv.reshape(-1, 3))
-            dip_nuc = jnp.einsum('q,qx', nuclear_charges, geom.reshape(-1,3))
-            print("Nuclear Dipole: ", dip_nuc.reshape(-1, 3))
-            deriv += dip_nuc
-
         return np.asarray(deriv)
 
     # Partial derivatives
     else:
-        if len(partial) != deriv_order:
-            raise Exception("The length of the index coordinates given by 'partial' argument should be the same as the order of differentiation")
-
-        # Estimate memory footprint of two electron integrals partial derivatives
-        natoms = molecule.natom()
-        nderivs = get_required_deriv_vecs(natoms, deriv_order, partial).shape[0]
-        ngigabytes = nbf**4 * 64 * 8 * nderivs / 1e9
-        print("Estimated memory footprint from two-electron integral partial derivatives: {} GB".format(ngigabytes))
+        nderivs = len(partial)
 
-        # For partial derivatives, need to unpack each geometric coordinate into separate arguments
+        # For partial derivatives of geometry, need to unpack each geometric coordinate into separate arguments
         # to differentiate wrt specific coordinates using JAX AD utilities. 
 
         #TODO support internal coordinate wrapper function.
@@ -227,22 +212,22 @@ def partial_wrapper(*args):
         else:
             params = geom_list
 
-        if deriv_order == 1:
+        if nderivs == 1:
             i = partial[0]
             partial_deriv = jacfwd(partial_wrapper, i)(*params)
-        elif deriv_order == 2:
+        elif nderivs == 2:
             i,j = partial[0], partial[1]
             partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*params)
-        elif deriv_order == 3:
+        elif nderivs == 3:
             i,j,k = partial[0], partial[1], partial[2]
             partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*params)
-        elif deriv_order == 4:
+        elif nderivs == 4:
             i,j,k,l = partial[0], partial[1], partial[2], partial[3]
             partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*params)
-        elif deriv_order == 5:
+        elif nderivs == 5:
             i,j,k,l,m = partial[0], partial[1], partial[2], partial[3], partial[4]
             partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*params)
-        elif deriv_order == 6:
+        elif nderivs == 6:
             i,j,k,l,m,n = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5]
             partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*params)
         else:
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index f1512c8..d329fdb 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -48,8 +48,7 @@ def form_shift():
 
     if options['electric_field']:
         Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options)
-        val = jnp.einsum('x,xij->ij', electric_field, Mu_XYZ)
-        H += val
+        H += jnp.einsum('x,xij->ij', electric_field, Mu_XYZ)
     
     def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
@@ -96,6 +95,9 @@ def scf_procedure(carry):
                                                               # (iter, dE, dRMS, eps, C, D_old, D, E_scf)
     print(iteration, " RHF iterations performed")
 
+    if options['electric_field']:
+        E_scf += jnp.einsum('x,q,qx', electric_field, nuclear_charges, geom.reshape(-1,3))
+
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 
     tmp = jnp.round(eps, 6)
     ndegen_orbs =  tmp.shape[0] - jnp.unique(tmp).shape[0] 

From b80c306f6b6f92591cd6ed1567ab05e8c7186c5d Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Thu, 4 Apr 2024 15:30:47 -0400
Subject: [PATCH 61/91] Rewrite core.py

---
 quax/core.py | 615 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 361 insertions(+), 254 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 4179b57..c1eedb0 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -54,344 +54,451 @@ def check_options(options):
             print("{} keyword option not recognized.".format(key))
     return keyword_options
 
-def compute(molecule, basis_name, method, electric_field=None, options=None, deriv_order=0, partial=None):
+def compute_standard(method, method_args, deriv_order=0, partial=None, options=None):
     """
-    General function for computing energies, derivatives, and partial derivatives.
+    General function for computing energies, derivatives, and partial derivatives with respect to one input variable.
     """
-    # Set keyword options
-    if options:
-        options = check_options(options)
-        if deriv_order == 0:
-            options['integral_algo'] = 'libint_core'
-    else:
-        options = check_options({})
-    print("Using integral method: {}".format(options['integral_algo']))
-    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
-
-    # Load molecule data
-    geom2d = np.asarray(molecule.geometry())
-    geom_list = geom2d.reshape(-1).tolist()
-    geom = jnp.asarray(geom2d.flatten())
-    xyz_file_name = "geom.xyz"
-    molecule.save_xyz_file(xyz_file_name, True)
-    xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
-    mult = molecule.multiplicity()
-    charge = molecule.molecular_charge()
-    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
-    nelectrons = int(jnp.sum(nuclear_charges)) - charge
-    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
-
-    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
-    nbf = basis_set.nbf()
-    print("Basis name: ", basis_set.name())
-    print("Number of basis functions: ", nbf)
-
-    if 'f12' in method:
-        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
-
-    if options['electric_field'] and type(electric_field) == type(None):
-        raise Exception("Electric field must be given for dipole computation.")
-
     # Energy and full derivative tensor evaluations
     if not partial:
         # Create energy evaluation function
         if method == 'scf' or method == 'hf' or method == 'rhf':
-            args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
-            if options['electric_field']:
-                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2':
-            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['electric_field']:
-                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2(*args, options=options, deriv_order=deriv_order)
         elif method =='mp2-f12':
-            args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['electric_field']:
-                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd':
-            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['electric_field']:
-                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd(*args, options=options, deriv_order=deriv_order)
         elif method =='ccsd(t)':
-            args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-            if options['electric_field']:
-                args = (electric_field,) + args
             def electronic_energy(*args, options=options, deriv_order=deriv_order):
                 return rccsd_t(*args, options=options, deriv_order=deriv_order)
         else:
-            print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
+            raise Exception("Error: Method {} not supported.".format(method))
 
         # Evaluate energy or derivative 
         if deriv_order == 0:
-            energy = electronic_energy(*args)
+            energy = electronic_energy(*method_args)
             return energy
         elif deriv_order == 1:
-            grad = jacfwd(electronic_energy, 0)(*args)
+            grad = jacfwd(electronic_energy, 0)(*method_args)
             deriv = jnp.round(grad, 10)
         elif deriv_order == 2:
-            hess = jacfwd(jacfwd(electronic_energy, 0))(*args)
+            hess = jacfwd(jacfwd(electronic_energy, 0))(*method_args)
             deriv = jnp.round(hess, 10)
         elif deriv_order == 3:
-            cubic = jacfwd(jacfwd(jacfwd(electronic_energy, 0)))(*args)
+            cubic = jacfwd(jacfwd(jacfwd(electronic_energy, 0)))(*method_args)
             deriv = jnp.round(cubic, 10)
         elif deriv_order == 4:
-            quartic = jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, 0))))(*args)
+            quartic = jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, 0))))(*method_args)
             deriv = jnp.round(quartic, 10)
         else:
-            print("Error: Order {} derivatives are not exposed to the API.".format(deriv_order))
+            raise Exception("Error: Order {} derivatives are not exposed to the API.".format(deriv_order))
             deriv = 0
         return np.asarray(deriv)
+    
+    # Partial derivatives
+    else:
+        if len(partial) != deriv_order:
+            raise Exception("The length of the index coordinates given by 'partial' argument should be the same as the order of differentiation")
 
+        # For partial derivatives, need to unpack each geometric or electric field coordinate into separate arguments
+        # to differentiate wrt specific coordinates using JAX AD utilities.
+        param_list = method_args[0]
+
+        #TODO support internal coordinate wrapper function.
+        # This will take in internal coordinates, transform them into cartesians, and then compute integrals, energy
+        # JAX will then collect the internal coordinate partial derivative instead. 
+        if method == 'scf' or method == 'hf' or method == 'rhf':
+            def partial_wrapper(*args):
+                param = jnp.asarray(args)
+                args = (param,) + method_args[1:]
+                E_scf = restricted_hartree_fock(*args, options=options, deriv_order=deriv_order, return_aux_data=False)
+                return E_scf
+        elif method =='mp2':
+            def partial_wrapper(*args):
+                param = jnp.asarray(args)
+                args = (param,) + method_args[1:]
+                E_mp2 = restricted_mp2(*args, options=options, deriv_order=deriv_order)
+                return E_mp2
+        elif method =='mp2-f12':
+            def partial_wrapper(*args):
+                param = jnp.asarray(args)
+                args = (param,) + method_args[1:]
+                E_mp2f12 = restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
+                return E_mp2f12
+        elif method =='ccsd':
+            def partial_wrapper(*args):
+                param = jnp.asarray(args)
+                args = (param,) + method_args[1:]
+                E_ccsd = rccsd(*args, options=options, deriv_order=deriv_order)
+                return E_ccsd
+        elif method =='ccsd(t)':
+            def partial_wrapper(*args):
+                param = jnp.asarray(args)
+                args = (param,) + method_args[1:]
+                E_ccsd_t = rccsd_t(*args, options=options, deriv_order=deriv_order)
+                return E_ccsd_t
+        else:
+            raise Exception("Error: Method {} not supported.".format(method))
+
+        if deriv_order == 1:
+            i = partial[0]
+            partial_deriv = jacfwd(partial_wrapper, i)(*param_list)
+        elif deriv_order == 2:
+            i,j = partial[0], partial[1]
+            partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*param_list)
+        elif deriv_order == 3:
+            i,j,k = partial[0], partial[1], partial[2]
+            partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*param_list)
+        elif deriv_order == 4:
+            i,j,k,l = partial[0], partial[1], partial[2], partial[3]
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*param_list)
+        elif deriv_order == 5:
+            i,j,k,l,m = partial[0], partial[1], partial[2], partial[3], partial[4]
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*param_list)
+        elif deriv_order == 6:
+            i,j,k,l,m,n = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5]
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*param_list)
+        else:
+            raise Exception("Error: Order {} partial derivatives are not exposed to the API.".format(deriv_order))
+            partial_deriv = 0
+        return jnp.round(partial_deriv, 14)
+    
+def compute_mixed(method, method_args, deriv_order_F=1, deriv_order_R=1, partial_F=None, partial_R=None, options=None):
+    """
+    General function for computing energies, derivatives, and partial derivatives with respect to two input variables.
+    """
+    # Number of differentiation calls depends on the total
+    total_deriv_order = deriv_order_F + deriv_order_R
+    
+    # Energy and full derivative tensor evaluations
+    if not partial_F or not partial_R:
+        # Creates indices list to decide electric_field or coordinate differentiation
+        FR_list = np.append(np.zeros(deriv_order_F, int), np.ones(deriv_order_R, int))
+
+        # Create energy evaluation function
+        if method == 'scf' or method == 'hf' or method == 'rhf':
+            def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
+                return restricted_hartree_fock(*args, options=options, deriv_order=deriv_order)
+        elif method =='mp2':
+            def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
+                return restricted_mp2(*args, options=options, deriv_order=deriv_order)
+        elif method =='mp2-f12':
+            def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
+                return restricted_mp2_f12(*args, options=options, deriv_order=deriv_order)
+        elif method =='ccsd':
+            def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
+                return rccsd(*args, options=options, deriv_order=deriv_order)
+        elif method =='ccsd(t)':
+            def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
+                return rccsd_t(*args, options=options, deriv_order=deriv_order)
+        else:
+            print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
+
+        if total_deriv_order == 2:
+            i,j = FR_list[0], FR_list[1]
+            deriv = jacfwd(jacfwd(electronic_energy, i), j)(*method_args)
+        elif total_deriv_order == 3:
+            i,j,k = FR_list[0], FR_list[1], FR_list[2]
+            deriv = jacfwd(jacfwd(jacfwd(electronic_energy, i), j), k)(*method_args)
+        elif total_deriv_order == 4:
+            i,j,k,l = FR_list[0], FR_list[1], FR_list[2], FR_list[3]
+            deriv = jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, i), j), k), l)(*method_args)
+        elif total_deriv_order == 5:
+            i,j,k,l,m = FR_list[0], FR_list[1], FR_list[2], FR_list[3], FR_list[4]
+            deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, i), j), k), l), m)(*method_args)
+        elif total_deriv_order == 6:
+            i,j,k,l,m,n = FR_list[0], FR_list[1], FR_list[2], FR_list[3], FR_list[4], FR_list[5]
+            deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(electronic_energy, i), j), k), l), m), n)(*method_args)
+        else:
+            print("Error: Order {},{} mixed derivatives are not exposed to the API.".format(deriv_order_F, deriv_order_R))
+            deriv = 0
+        return np.asarray(deriv)
+    
     # Partial derivatives
     else:
-        nderivs = len(partial)
+        if len(partial_F) != deriv_order_F or len(partial_R) != deriv_order_R:
+            raise Exception("The length of the index coordinates given by 'partial' argument should be the same as the order of differentiation")
 
-        # For partial derivatives of geometry, need to unpack each geometric coordinate into separate arguments
-        # to differentiate wrt specific coordinates using JAX AD utilities. 
+        # For partial derivatives, need to unpack each geometric or electric field coordinate into separate arguments
+        # to differentiate wrt specific coordinates using JAX AD utilities.
+        param_list = (*method_args[0],) + (*method_args[1],)
 
         #TODO support internal coordinate wrapper function.
         # This will take in internal coordinates, transform them into cartesians, and then compute integrals, energy
         # JAX will then collect the internal coordinate partial derivative instead. 
         if method == 'scf' or method == 'hf' or method == 'rhf':
             def partial_wrapper(*args):
-                if options['electric_field']:
-                    method_args = args + (basis_set, nelectrons, nuclear_charges, xyz_path)
-                else:
-                    geom = jnp.asarray(args)
-                    method_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
-                E_scf = restricted_hartree_fock(*method_args, options=options, deriv_order=deriv_order, return_aux_data=False)
+                param1 = jnp.asarray(args[0:3])
+                param2 = jnp.asarray(args[3:])
+                args = (param1, param2) + method_args[2:]
+                E_scf = restricted_hartree_fock(*args, options=options, deriv_order=deriv_order_R, return_aux_data=False)
                 return E_scf
         elif method =='mp2':
             def partial_wrapper(*args):
-                if options['electric_field']:
-                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                else:
-                    geom = jnp.asarray(args)
-                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                E_mp2 = restricted_mp2(*method_args, options=options, deriv_order=deriv_order)
+                param1 = jnp.asarray(args[0:3])
+                param2 = jnp.asarray(args[3:])
+                args = (param1, param2) + method_args[2:]
+                E_mp2 = restricted_mp2(*args, options=options, deriv_order=deriv_order_R)
                 return E_mp2
         elif method =='mp2-f12':
             def partial_wrapper(*args):
-                if options['electric_field']:
-                    method_args = args + (basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                else:
-                    geom = jnp.asarray(args)
-                    method_args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                E_mp2f12 = restricted_mp2_f12(*method_args, options=options, deriv_order=deriv_order)
+                param1 = jnp.asarray(args[0:3])
+                param2 = jnp.asarray(args[3:])
+                args = (param1, param2) + method_args[2:]
+                E_mp2f12 = restricted_mp2_f12(*args, options=options, deriv_order=deriv_order_R)
                 return E_mp2f12
         elif method =='ccsd':
             def partial_wrapper(*args):
-                if options['electric_field']:
-                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                else:
-                    geom = jnp.asarray(args)
-                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                E_ccsd = rccsd(*method_args, options=options, deriv_order=deriv_order)
+                param1 = jnp.asarray(args[0:3])
+                param2 = jnp.asarray(args[3:])
+                args = (param1, param2) + method_args[2:]
+                E_ccsd = rccsd(*args, options=options, deriv_order=deriv_order_R)
                 return E_ccsd
         elif method =='ccsd(t)':
             def partial_wrapper(*args):
-                if options['electric_field']:
-                    method_args = args + (basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                else:
-                    geom = jnp.asarray(args)
-                    method_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-                E_ccsd_t = rccsd_t(*method_args, options=options, deriv_order=deriv_order)
+                param1 = jnp.asarray(args[0:3])
+                param2 = jnp.asarray(args[3:])
+                args = (param1, param2) + method_args[2:]
+                E_ccsd_t = rccsd_t(*args, options=options, deriv_order=deriv_order_R)
                 return E_ccsd_t
         else:
             raise Exception("Error: Method {} not supported.".format(method))
         
-        if options['electric_field']:
-            params = (electric_field, geom)
-        else:
-            params = geom_list
+        # Combine partial tuples into one array
+        partial = np.append(np.array(partial_F), np.array(partial_R) + 3)
 
-        if nderivs == 1:
-            i = partial[0]
-            partial_deriv = jacfwd(partial_wrapper, i)(*params)
-        elif nderivs == 2:
+        if total_deriv_order == 2:
             i,j = partial[0], partial[1]
-            partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*params)
-        elif nderivs == 3:
+            partial_deriv = jacfwd(jacfwd(partial_wrapper, i), j)(*param_list)
+        elif total_deriv_order == 3:
             i,j,k = partial[0], partial[1], partial[2]
-            partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*params)
-        elif nderivs == 4:
+            partial_deriv = jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k)(*param_list)
+        elif total_deriv_order == 4:
             i,j,k,l = partial[0], partial[1], partial[2], partial[3]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*params)
-        elif nderivs == 5:
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l)(*param_list)
+        elif total_deriv_order == 5:
             i,j,k,l,m = partial[0], partial[1], partial[2], partial[3], partial[4]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*params)
-        elif nderivs == 6:
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m)(*param_list)
+        elif total_deriv_order == 6:
             i,j,k,l,m,n = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5]
-            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*params)
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n)(*param_list)
+        elif total_deriv_order == 7:
+            i,j,k,l,m,n,p = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5], partial[6]
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n), p)(*param_list)
+        elif total_deriv_order == 8:
+            i,j,k,l,m,n,p,q = partial[0], partial[1], partial[2], partial[3], partial[4], partial[5], partial[6], partial[7]
+            partial_deriv = jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(jacfwd(partial_wrapper, i), j), k), l), m), n), p), q)(*param_list)
         else:
-            print("Error: Order {} partial derivatives are not exposed to the API.".format(deriv_order))
+            print("Error: Order {},{} mixed derivatives are not exposed to the API.".format(deriv_order_F, deriv_order_R))
             partial_deriv = 0
-        return jnp.round(partial_deriv, 10)
+        return jnp.round(partial_deriv, 14)
 
-def energy(molecule, basis_name, method, electric_field=None, options=None):
+def energy(molecule, basis_name, method, options=None):
     """
-    Call an energy method on a molecule and basis set.
+    """
+    # Set keyword options
+    if options:
+        options = check_options(options)
+    else:
+        options = check_options({'integral_algo': 'libint_core'})
+    print("Using integral method: {}".format(options['integral_algo']))
+    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
 
-    Parameters
-    ----------
-    molecule : psi4.Molecule
-        A Psi4 Molecule object containing geometry, charge, multiplicity, and optionally units in a multiline string. 
-        Examples:
-        molecule = psi4.geometry('''
-                                 0 1
-                                 H 0.0 0.0 -0.55000000000
-                                 H 0.0 0.0  0.55000000000
-                                 units bohr
-                                 ''')
-
-        molecule = psi4.geometry('''
-                                 0 1
-                                 O
-                                 H 1 r1
-                                 H 1 r2 2 a1
-                        
-                                 r1 = 1.0
-                                 r2 = 1.0
-                                 a1 = 104.5
-                                 units ang
-                                 ''')
-
-    basis_name : str
-        A string representing a Gaussian basis set available in Psi4's basis set library (also needs to be in Libint's basis set library if using Libint interface).
-
-    method : str
-        A string representing a quantum chemistry method supported in Quax
-        method = 'scf', method = 'mp2', method = 'ccsd(t)'
+    # Load molecule data
+    geom2d = np.asarray(molecule.geometry())
+    geom_list = geom2d.reshape(-1).tolist()
+    geom = jnp.asarray(geom2d.flatten())
+    xyz_file_name = "geom.xyz"
+    molecule.save_xyz_file(xyz_file_name, True)
+    xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
+    mult = molecule.multiplicity()
+    charge = molecule.molecular_charge()
+    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
 
-    options : dict
-        Dictionary of user-supplied keyword options.
+    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+    nbf = basis_set.nbf()
+    print("Basis name: ", basis_set.name())
+    print("Number of basis functions: ", nbf)
 
-    Returns
-    -------
-    The electronic energy in a.u. (Hartrees)
-    """
-    E = compute(molecule, basis_name, method, electric_field, options)
-    return E
+    if method == 'scf' or method == 'hf' or method == 'rhf':
+        args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+    elif method =='mp2':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='mp2-f12':
+        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
+        args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd(t)':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    else:
+        print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
+
+    return compute_standard(method, args, deriv_order=0, partial=None, options=options)
 
-def derivative(molecule, basis_name, method, electric_field=None, deriv_order=1, options=None):
+def geom_deriv(molecule, basis_name, method, deriv_order=1, partial=None, options=None):
     """
-    Compute the full Cartesian derivative tensor for a particular energy method, molecule, and basis set. 
+    """
+    # Set keyword options
+    if options:
+        options = check_options(options)
+        if deriv_order == 0:
+            options['integral_algo'] = 'libint_core'
+    else:
+        options = check_options({})
+    print("Using integral method: {}".format(options['integral_algo']))
+    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
 
-    Parameters
-    ----------
-    molecule : psi4.Molecule
-        A Psi4 Molecule object containing geometry, charge, multiplicity in a multiline string. 
-        Examples:
-        molecule = psi4.geometry('''
-                                 0 1
-                                 H 0.0 0.0 -0.55000000000
-                                 H 0.0 0.0  0.55000000000
-                                 units bohr
-                                 ''')
-
-        molecule = psi4.geometry('''
-                                 0 1
-                                 O
-                                 H 1 r1
-                                 H 1 r2 2 a1
-                        
-                                 r1 = 1.0
-                                 r2 = 1.0
-                                 a1 = 104.5
-                                 units ang
-                                 ''')
-
-    basis_name : str
-        A string representing a Gaussian basis set available in Psi4's basis set library (also needs to be in Libint's basis set library if using Libint interface).
-
-    method : str
-        A string representing a quantum chemistry method supported in Quax
-        method = 'scf', method = 'mp2', method = 'ccsd(t)'
-
-    deriv_order : int
-        The order of the derivative. order = 1 -> first derivative ; order = 2 --> second derivative ...
+    # Load molecule data
+    geom2d = np.asarray(molecule.geometry())
+    geom_list = geom2d.reshape(-1).tolist()
+    geom = jnp.asarray(geom2d.flatten())
+    xyz_file_name = "geom.xyz"
+    molecule.save_xyz_file(xyz_file_name, True)
+    xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
+    mult = molecule.multiplicity()
+    charge = molecule.molecular_charge()
+    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
 
-    options : dict
-        Dictionary of user-supplied keyword options.
+    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+    nbf = basis_set.nbf()
+    print("Basis name: ", basis_set.name())
+    print("Number of basis functions: ", nbf)
 
-    Returns
-    -------
-    deriv : float
-        The requested derivative tensor, elements have units of Hartree/bohr^(n)
-    """
-    deriv = compute(molecule, basis_name, method, electric_field, options, deriv_order)
-    return deriv
+    if method == 'scf' or method == 'hf' or method == 'rhf':
+        args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+    elif method =='mp2':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='mp2-f12':
+        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
+        args = (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd(t)':
+        args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    else:
+        print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
+
+    return compute_standard(method, args, deriv_order=deriv_order, partial=partial, options=options)
 
-def partial_derivative(molecule, basis_name, method, electric_field=None, deriv_order=0, partial=None, options=None):
+def efield_deriv(molecule, basis_name, method, electric_field=None, deriv_order=1, partial=None, options=None):
     """
-    Computes one particular nth-order partial derivative of the energy of an electronic structure method
-    w.r.t. a set of cartesian coordinates. If you have N cartesian coordinates in your molecule, the nuclear derivative tensor
-    is N x N x N ... however many orders of differentiation. This function computes one element of that tensor, depending
-    on the address of the derivative you supply.
-    If you have 9 cartesian coordinates x1,y1,z1,x2,y2,z2,x3,y3,z3 and you want the quartic derivative d^4E/dx1dy2(dz3)^2
-    the partial derivative address in the quartic derivative tensor would be (0, 4, 8, 8).
-    Note that this is the same derivative as, say, (4, 8, 0, 8), or any other permutation of that tuple.
-    Also note this is dependent upon the order in which you supply the cartesian coordinates in the molecule object,
-    because that will determine the indices of the coordinates.
+    """
+    if type(electric_field) == type(None):
+        raise Exception("Electric field must be given for dipole computation.")
+    
+    try:
+        options['electric_field']
+    except:
+        options['electric_field'] = True
+    
+    # Set keyword options
+    if options:
+        options = check_options(options)
+        if deriv_order == 0:
+            options['integral_algo'] = 'libint_core'
 
-    Parameters
-    ----------
-    molecule : psi4.Molecule
-        A Psi4 Molecule object containing geometry, charge, multiplicity in a multiline string. 
-        Examples:
-        molecule = psi4.geometry('''
-                                 0 1
-                                 H 0.0 0.0 -0.55000000000
-                                 H 0.0 0.0  0.55000000000
-                                 units bohr
-                                 ''')
-
-        molecule = psi4.geometry('''
-                                 0 1
-                                 O
-                                 H 1 r1
-                                 H 1 r2 2 a1
-                        
-                                 r1 = 1.0
-                                 r2 = 1.0
-                                 a1 = 104.5
-                                 units ang
-                                 ''')
-
-    basis_name : str
-        A string representing a Gaussian basis set available in Psi4's basis set library (also needs to be in Libint's basis set library if using Libint interface).
-
-    method : str
-        A string representing a quantum chemistry method supported in Quax e.g. 'scf', 'mp2' 'ccsd(t)'
-
-    deriv_order : int
-        The order of the derivative. order = 1 -> first derivative ; order = 2 --> second derivative ...
-
-    partial : tuple of ints
-       A tuple of indices at which the desired derivative appears in the derivative tensor. 
-       Coordinates are indexed according to their location in the row-wise flattened Cartesian coordinate array:
-       atom  x   y   z
-        A    0   1   2
-        B    3   4   5 
-        C    6   7   8 
-       E.g. The second derivative w.r.t the first atoms x-components would have partial=(0,0)
-       The mixed partial derivative w.r.t. y-components on first and third atoms would be partial=(1,7)
+    print("Using integral method: {}".format(options['integral_algo']))
+    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
 
-    options : dict
-        Dictionary of user-supplied keyword options.
+    # Load molecule data
+    geom2d = np.asarray(molecule.geometry())
+    geom_list = geom2d.reshape(-1).tolist()
+    geom = jnp.asarray(geom2d.flatten())
+    xyz_file_name = "geom.xyz"
+    molecule.save_xyz_file(xyz_file_name, True)
+    xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
+    mult = molecule.multiplicity()
+    charge = molecule.molecular_charge()
+    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
 
-    Returns
-    -------
-    partial_deriv : float
-        The requested partial derivative of the energy in units of Hartree/bohr^(n)
+    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+    nbf = basis_set.nbf()
+    print("Basis name: ", basis_set.name())
+    print("Number of basis functions: ", nbf)
+
+    if method == 'scf' or method == 'hf' or method == 'rhf':
+        args = (electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+    elif method =='mp2':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='mp2-f12':
+        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
+        args = (electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd(t)':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    else:
+        print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
+
+    return compute_standard(method, args, deriv_order=deriv_order, partial=partial, options=options)
+
+def mixed_deriv(molecule, basis_name, method, electric_field=None,
+                deriv_order_F=1, deriv_order_R=1, partial_F=None, partial_R=None, options=None):
     """
-    partial_deriv = compute(molecule, basis_name, method, electric_field, options, deriv_order, partial)
-    return partial_deriv
+    """
+    if deriv_order_F == 0 or deriv_order_R == 0:
+        raise Exception("Error: Order of differentiation cannot equal zero. Use energy or geometry_deriv or electric_field instead.")
+
+    if type(electric_field) == type(None):
+        raise Exception("Electric field must be given for dipole computation.")
+    
+    try:
+        options['electric_field']
+    except:
+        options['electric_field'] = True
+    
+    # Set keyword options
+    if options:
+        options = check_options(options)
+        if deriv_order_F == 0 and deriv_order_R == 0:
+            options['integral_algo'] = 'libint_core'
+
+    print("Using integral method: {}".format(options['integral_algo']))
+    print("Number of OMP Threads: {}".format(psi4.core.get_num_threads()))
+
+    # Load molecule data
+    geom2d = np.asarray(molecule.geometry())
+    geom_list = geom2d.reshape(-1).tolist()
+    geom = jnp.asarray(geom2d.flatten())
+    xyz_file_name = "geom.xyz"
+    molecule.save_xyz_file(xyz_file_name, True)
+    xyz_path = os.path.abspath(os.getcwd()) + "/" + xyz_file_name
+    mult = molecule.multiplicity()
+    charge = molecule.molecular_charge()
+    nuclear_charges = jnp.asarray([molecule.charge(i) for i in range(geom2d.shape[0])])
+    nelectrons = int(jnp.sum(nuclear_charges)) - charge
+    nfrzn = n_frozen_core(molecule, charge) if options['freeze_core'] else 0
+
+    basis_set = psi4.core.BasisSet.build(molecule, 'BASIS', basis_name, puream=0)
+    nbf = basis_set.nbf()
+    print("Basis name: ", basis_set.name())
+    print("Number of basis functions: ", nbf)
+
+    if method == 'scf' or method == 'hf' or method == 'rhf':
+        args = (electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+    elif method =='mp2':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='mp2-f12':
+        cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
+        args = (electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    elif method =='ccsd(t)':
+        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+    else:
+        print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
 
+    return compute_mixed(method, args, deriv_order_F=deriv_order_F, deriv_order_R=deriv_order_R, 
+                         partial_F=partial_F, partial_R=partial_R, options=options)
\ No newline at end of file

From 08b574a5a3600170af59a0d2de9818625acf6214 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 5 Apr 2024 15:00:09 -0400
Subject: [PATCH 62/91] Mod tests, MP2-F12

---
 quax/methods/basis_utils.py |   5 +-
 quax/methods/mp2.py         |   3 +-
 quax/methods/mp2f12.py      | 107 ++++++++++++++++++++----------------
 tests/test_dipoles.py       |  45 +++++++++++++++
 tests/test_gradients.py     |  12 ++--
 tests/test_hessians.py      |  12 ++--
 6 files changed, 121 insertions(+), 63 deletions(-)
 create mode 100644 tests/test_dipoles.py

diff --git a/quax/methods/basis_utils.py b/quax/methods/basis_utils.py
index 36538a7..04d3efd 100644
--- a/quax/methods/basis_utils.py
+++ b/quax/methods/basis_utils.py
@@ -16,7 +16,7 @@ def build_RIBS(molecule, basis_set, cabs_name):
     # Libint uses the suffix 'cabs' but Psi4 uses 'optri'
     basis_name = basis_set.name()
     try:
-        psi4_name = cabs_name.lower().replace('cabs', 'optri')
+        psi4_name = cabs_name.upper().replace('CABS', 'OPTRI')
     except:
         raise Exception("Must use a cc-pVXZ-F12 or aug-cc-pVXZ basis set for F12 methods.")
 
@@ -30,6 +30,9 @@ def build_RIBS(molecule, basis_set, cabs_name):
     ao_union['name'] = cabs_name
     ribs_set = psi4.core.BasisSet.construct_from_pydict(molecule, ao_union, 0)
 
+    print("Basis name: ", cabs_name.upper())
+    print("Number of basis functions: ", ribs_set.nbf())
+
     return ribs_set
 
 def build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options):
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 338a23b..c846069 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -2,7 +2,6 @@
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 from jax.lax import fori_loop
-import psi4
 
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .hartree_fock import restricted_hartree_fock
@@ -49,7 +48,7 @@ def loop_mp2(idx, mp2_corr):
 
     if return_aux_data:
         #print("MP2 Energy:                ", E_scf + dE_mp2)
-        return E_scf + dE_mp2, C, eps
+        return E_scf + dE_mp2, C, eps, G
     else:
         return E_scf + dE_mp2
 
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 5101d1e..2e93479 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -19,43 +19,36 @@ def restricted_mp2_f12(*args, options, deriv_order=0):
         geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
         mp2_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
 
-    E_mp2, C_obs, eps = restricted_mp2(*mp2_args, options=options, deriv_order=deriv_order, return_aux_data=True)
+    E_mp2, C_obs, eps, G = restricted_mp2(*mp2_args, options=options, deriv_order=deriv_order, return_aux_data=True)
     ndocc = nelectrons // 2
     ncore = nfrzn // 2
     eps_occ, eps_vir = eps[:ndocc], eps[ndocc:]
 
     print("Running MP2-F12 Computation...")
     C_cabs = build_CABS(geom, basis_set, cabs_set, xyz_path, deriv_order, options)
-
-    # S_ao = compute_f12_oeints(geom, cabs_set, cabs_set, xyz_path, deriv_order, options, True)
-    # test = C_cabs.T @ S_ao @ C_cabs
-    # print(test)
-
-    # return jnp.array([0, 0])
+    C_mats = (C_obs[:, :ndocc], C_obs, C_cabs) # C_occ, C_obs, C_cabs
 
     nobs = C_obs.shape[0]
-    nri = C_obs.shape[0] + C_cabs.shape[1]
+    spaces = (ndocc, nobs, C_cabs.shape[0]) # ndocc, nobs, nri
 
     # Fock
-    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
 
     # V Intermediate
-    V = form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    V = form_V(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)\
 
     # X Intermediate
-    X = form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    X = form_X(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
 
     # C Intermediate
-    C = form_C(geom, basis_set, cabs_set, f[nobs:, ndocc:nobs], C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options)
+    C = form_C(geom, basis_set, cabs_set, f[nobs:, ndocc:nobs], C_mats, spaces, xyz_path, deriv_order, options)
 
     # B Intermediate
-    B = form_B(geom, basis_set, cabs_set, f, k, fk[:ndocc, :], C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    B = form_B(geom, basis_set, cabs_set, f, k, fk[:ndocc, :], C_mats, spaces, xyz_path, deriv_order, options)
 
     D = -1.0 / (eps_occ.reshape(-1, 1, 1, 1) + eps_occ.reshape(-1, 1, 1) - eps_vir.reshape(-1, 1) - eps_vir)
+    G = jnp.swapaxes(G, 1, 2)
 
-    G = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
-                             C_obs, C_obs, C_obs, C_obs, xyz_path, deriv_order, options)
-    
     indices = jnp.asarray(jnp.triu_indices(ndocc)).reshape(2,-1).T
 
     def loop_energy(idx, f12_corr):
@@ -64,7 +57,7 @@ def loop_energy(idx, f12_corr):
 
         D_ij = D[i, j, :, :]
 
-        GD_ij = jnp.einsum('ab,ab->ab', G[i, j, ndocc:, ndocc:], D_ij, optimize='optimal')
+        GD_ij = jnp.einsum('ab,ab->ab', G[i - 1, j - 1, :, :], D_ij, optimize='optimal')
         V_ij = V[i, j, :, :] - jnp.einsum('klab,ab->kl', C, GD_ij, optimize='optimal')
 
         V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
@@ -92,12 +85,13 @@ def loop_energy(idx, f12_corr):
     start = ndocc if ncore > 0 else 0
     dE_mp2f12 = fori_loop(start, indices.shape[0], loop_energy, 0.0)
 
-    E_s = cabs_singles(f, ndocc, nri)
+    E_s = cabs_singles(f, spaces)
 
     return E_mp2 + dE_mp2f12 + E_s
 
 # CABS Singles
-def cabs_singles(f, ndocc, nri):
+def cabs_singles(f, spaces):
+    ndocc, _, nri = spaces
     all_vir = nri - ndocc
 
     e_ij, C_ij = jnp.linalg.eigh(f[:ndocc, :ndocc])
@@ -136,7 +130,10 @@ def one_body_mo_computer(geom, bs1, bs2, C1, C2, xyz_path, deriv_order, options)
     MO = C1.T @ AO @ C2
     return MO
 
-def form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options):
+def form_h(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    _, nobs, nri = spaces
+    _, C_obs, C_cabs = C_mats
+
     tv = jnp.zeros((nri, nri))
 
     mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, xyz_path, deriv_order, options)
@@ -164,9 +161,11 @@ def two_body_mo_computer(geom, int_type, bs1, bs2, bs3, bs4, C1, C2, C3, C4, xyz
     MO = jnp.swapaxes(MO, 1, 2)
     return MO
 
-def form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_J(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    ndocc, nobs, nri = spaces
+    C_occ, C_obs, C_cabs = C_mats
+
     eri = jnp.zeros((nri, ndocc, nri, ndocc))
-    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
                                C_obs, C_occ, C_obs, C_occ, xyz_path, deriv_order, options)
@@ -183,9 +182,11 @@ def form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
 
     return eri
 
-def form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_K(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    ndocc, nobs, nri = spaces
+    C_occ, C_obs, C_cabs = C_mats
+
     eri = jnp.empty((nri, ndocc, ndocc, nri))
-    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
                               C_obs, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
@@ -202,9 +203,11 @@ def form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
 
     return eri
 
-def form_ooO1(geom, int_type, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_ooO1(geom, int_type, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    ndocc, nobs, nri = spaces
+    C_occ, C_obs, C_cabs = C_mats
+
     eri = jnp.zeros((ndocc, ndocc, nobs, nri))
-    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, int_type, basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
@@ -216,9 +219,11 @@ def form_ooO1(geom, int_type, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, n
 
     return eri
 
-def form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_F(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    ndocc, nobs, nri = spaces
+    C_occ, C_obs, C_cabs = C_mats
+
     f12 = jnp.zeros((ndocc, ndocc, nri, nri))
-    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
@@ -235,9 +240,11 @@ def form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
 
     return f12
 
-def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_F2(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    ndocc, nobs, nri = spaces
+    C_occ, C_obs, C_cabs = C_mats
+
     f12_squared = jnp.zeros((ndocc, ndocc, ndocc, nri))
-    C_occ = C_obs.at[:, :ndocc].get()
 
     mo1 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
@@ -250,11 +257,11 @@ def form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path
     return f12_squared
 
 # Fock
-def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
+def form_Fock(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
 
-    fk = form_h(geom, basis_set, cabs_set, C_obs, C_cabs, nobs, nri, xyz_path, deriv_order, options)
-    J = form_J(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    K = form_K(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    fk = form_h(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    J = form_J(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    K = form_K(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
     
     # Fock Matrix without Exchange
     fk += 2.0 * jnp.einsum('piqi->pq', J, optimize='optimal')
@@ -267,13 +274,14 @@ def form_Fock(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_pa
     return f, fk, k
 
 # F12 Intermediates
-def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
+def form_V(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    C_occ, _, _ = C_mats
+    ndocc, nobs, _ = spaces
     
     FG = two_body_mo_computer(geom, "f12g12", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
-    G = form_ooO1(geom, "eri", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    G = form_ooO1(geom, "eri", basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
 
     ijkl_1 = jnp.einsum('ijmy,klmy->ijkl', G[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], optimize='optimal')
     ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2)) # ijxn,klxn->ijkl
@@ -281,12 +289,13 @@ def form_V(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
 
     return FG - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
+def form_X(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+    C_occ, _, _ = C_mats
+    ndocc, nobs, _ = spaces
     
     F2 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
-    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    F = form_ooO1(geom, "f12", basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
 
     ijkl_1 = jnp.einsum('ijmy,klmy->ijkl', F[:, :, :ndocc, nobs:], F[:, :, :ndocc, nobs:], optimize='optimal')
     ijkl_2 = jnp.transpose(ijkl_1, (1,0,3,2)) # ijxn,klxn->ijkl
@@ -294,8 +303,9 @@ def form_X(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path,
 
     return F2 - ijkl_1 - ijkl_2 - ijkl_3
 
-def form_C(geom, basis_set, cabs_set, f_cv, C_obs, C_cabs, ndocc, nobs, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
+def form_C(geom, basis_set, cabs_set, f_cv, C_mats, spaces, xyz_path, deriv_order, options):
+    C_occ, C_obs, C_cabs = C_mats
+    ndocc, nobs, _ = spaces
 
     F = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, cabs_set,\
                               C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
@@ -304,13 +314,14 @@ def form_C(geom, basis_set, cabs_set, f_cv, C_obs, C_cabs, ndocc, nobs, xyz_path
 
     return klab + jnp.transpose(klab, (1,0,3,2))
 
-def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options):
-    C_occ = C_obs.at[:, :ndocc].get()
+def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_mats, spaces, xyz_path, deriv_order, options):
+    C_occ, C_obs, C_cabs = C_mats
+    ndocc, nobs, _ = spaces
     
     Uf = two_body_mo_computer(geom, "f12_double_commutator", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_occ, C_occ, xyz_path, deriv_order, options)
-    F2 = form_F2(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
-    F = form_F(geom, basis_set, cabs_set, C_obs, C_cabs, ndocc, nobs, nri, xyz_path, deriv_order, options)
+    F2 = form_F2(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    F = form_F(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
 
     # Term 2
     terms = jnp.einsum('nmlP,kP->nmlk', F2, fk_o1)
@@ -339,4 +350,4 @@ def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_obs, C_cabs, ndocc, nobs, n
 
     B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2)) # nmlk->mnkl
 
-    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1))) # mnkl + klmn
+    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1))) # mnkl + klmn
\ No newline at end of file
diff --git a/tests/test_dipoles.py b/tests/test_dipoles.py
new file mode 100644
index 0000000..f371108
--- /dev/null
+++ b/tests/test_dipoles.py
@@ -0,0 +1,45 @@
+"""
+Test dipole moment computations (first-order electric field derivatives)
+"""
+import quax
+import psi4
+import pytest
+import numpy as np
+
+molecule = psi4.geometry("""
+0 1
+O   -0.000007070942     0.125146536460     0.000000000000
+H   -1.424097055410    -0.993053750648     0.000000000000
+H    1.424209276385    -0.993112599269     0.000000000000
+units bohr
+""")
+basis_name = 'sto-3g'
+psi4.set_options({
+                  'basis': basis_name,
+                  'scf_type': 'pk',
+                  'mp2_type':'conv',
+                  'e_convergence': 1e-10,
+                  'd_convergence':1e-10,
+                  'puream': 0
+                })
+
+options = {'damping':True, 'spectral_shift':False}
+efield = np.zeros((3))
+
+def test_hartree_fock_gradient(method='hf'):
+    psi4.properties(method, properties=['dipole'])
+    psi_deriv = psi4.variable("SCF DIPOLE")
+    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, partial=(0,), options=options)
+    assert np.allclose(psi_deriv, quax_deriv)
+    assert np.allclose(psi_deriv[0,0], quax_partial0)
+
+def test_ccsd_gradient(method='ccsd'):
+    psi4.properties(method, properties=['dipole'])
+    psi_deriv = psi4.variable("CC DIPOLE")
+    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, partial=(0,), options=options)
+    assert np.allclose(psi_deriv, quax_deriv)
+    assert np.allclose(psi_deriv[0,0], quax_partial0)
+
+
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
index a1fa101..f562c6d 100644
--- a/tests/test_gradients.py
+++ b/tests/test_gradients.py
@@ -27,22 +27,22 @@
 
 def test_hartree_fock_gradient(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.gradient(method + '/' + basis_name)), 10)
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=1, options=options)).reshape(-1,3)
-    quax_partial0 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=1, partial=(0,))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, partial=(0,), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial0)
 
 def test_mp2_gradient(method='mp2'):
     psi_deriv = np.round(np.asarray(psi4.gradient(method + '/' + basis_name)), 10)
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=1, options=options)).reshape(-1,3)
-    quax_partial0 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=1, partial=(0,))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, partial=(0,), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial0)
 
 def test_ccsd_t_gradient(method='ccsd(t)'):
     psi_deriv = np.round(np.asarray(psi4.gradient(method + '/' + basis_name)), 10)
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=1, options=options)).reshape(-1,3)
-    quax_partial0 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=1, partial=(0,))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=1, partial=(0,), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial0)
 
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index b0a36d2..d45ce51 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -28,24 +28,24 @@
 def test_hartree_fock_hessian(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.hessian(method + '/' + basis_name)), 10)
     n = psi_deriv.shape[0]
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=2, options=options)).reshape(n,n)
-    quax_partial00 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=2, partial=(0,0))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
+    quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_mp2_hessian(method='mp2'):
     psi_deriv = np.round(np.asarray(psi4.hessian(method + '/' + basis_name, dertype='gradient')), 10)
     n = psi_deriv.shape[0]
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=2, options=options)).reshape(n,n)
-    quax_partial00 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=2, partial=(0,0))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
+    quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_ccsd_t_hessian(method='ccsd(t)'):
     psi_deriv = np.round(np.asarray(psi4.hessian(method + '/' + basis_name, dertype='energy')), 10)
     n = psi_deriv.shape[0]
-    quax_deriv = np.asarray(quax.core.derivative(molecule, basis_name, method, deriv_order=2, options=options)).reshape(n,n)
-    quax_partial00 = quax.core.partial_derivative(molecule, basis_name, method, deriv_order=2, partial=(0,0))
+    quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
+    quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 

From ffd307d80dfd28e00b15a3b8e329a521f20bf836 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 5 Apr 2024 15:07:26 -0400
Subject: [PATCH 63/91] Clean-up

---
 quax/methods/ccsd.py         |  1 -
 quax/methods/hartree_fock.py |  1 -
 quax/methods/mp2.py          |  6 ------
 quax/methods/mp2f12.py       | 11 ++++-------
 4 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 91a1697..428a8c8 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -1,7 +1,6 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-import psi4
 
 from .energy_utils import tei_transformation
 from .hartree_fock import restricted_hartree_fock
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index d329fdb..90d2d8e 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -1,7 +1,6 @@
 import jax 
 jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-import psi4
 
 from .ints import compute_integrals, compute_dipole_ints
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index c846069..9bf617b 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -29,12 +29,6 @@ def restricted_mp2(*args, options, deriv_order=0, return_aux_data=False):
     eps_occ, eps_vir = eps[ncore:ndocc], eps[ndocc:]
     e_denom = jnp.reciprocal(eps_occ.reshape(-1, 1, 1, 1) - eps_vir.reshape(-1, 1, 1) + eps_occ.reshape(-1, 1) - eps_vir)
 
-    # Tensor contraction algo 
-    #mp2_correlation = jnp.einsum('iajb,iajb,iajb->', G, G, e_denom) +\
-    #                  jnp.einsum('iajb,iajb,iajb->', G - jnp.transpose(G, (0,3,2,1)), G, e_denom)
-    #mp2_total_energy = mp2_correlation + E_scf
-    #return E_scf + mp2_correlation
-
     # Loop algo (lower memory, but tei transform is the memory bottleneck)
     # Create all combinations of four loop variables to make XLA compilation easier
     indices = cartesian_product(jnp.arange(ndocc-ncore), jnp.arange(ndocc-ncore), jnp.arange(nvirt), jnp.arange(nvirt))
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 2e93479..1cd4e45 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -1,10 +1,7 @@
 import jax 
 from jax.config import config; config.update("jax_enable_x64", True)
 import jax.numpy as jnp
-from jax.lax import fori_loop
-import psi4
-import sys
-jnp.set_printoptions(threshold=sys.maxsize, linewidth=100)
+from jax.lax import fori_loop, cond
 
 from .basis_utils import build_CABS
 from .ints import compute_f12_oeints, compute_f12_teints
@@ -53,7 +50,7 @@ def restricted_mp2_f12(*args, options, deriv_order=0):
 
     def loop_energy(idx, f12_corr):
         i, j = indices[idx]
-        kd = jax.lax.cond(i == j, lambda: 1.0, lambda: 2.0)
+        kd = cond(i == j, lambda: 1.0, lambda: 2.0)
 
         D_ij = D[i, j, :, :]
 
@@ -62,7 +59,7 @@ def loop_energy(idx, f12_corr):
 
         V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
 
-        V_t = 0.25 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
+        V_t = 0.25 * cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i))
                                                * kd * (V_ij[i, j] - V_ij[j, i]), lambda: 0.0)
 
         CD_ij = jnp.einsum('mnab,ab->mnab', C, D_ij, optimize='optimal')
@@ -72,7 +69,7 @@ def loop_energy(idx, f12_corr):
                      * (B_ij[i, j, i, j] + B_ij[j, i, i, j]) \
                      * (t_(i, j, i, j) + t_(i, j, j, i)) * kd
 
-        B_t = 0.125 * jax.lax.cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
+        B_t = 0.125 * cond(i != j, lambda: (t_(i, j, i, j) - t_(i, j, j, i)) * kd
                                                  * (B_ij[i, j, i, j] - B_ij[j, i, i, j])
                                                  * (t_(i, j, i, j) - t_(i, j, j, i)) * kd,
                                                  lambda: 0.0)

From 459de07eaeb07dad6cbb5415a1ae32428f6d00d9 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Fri, 5 Apr 2024 15:58:55 -0400
Subject: [PATCH 64/91] Dipole ints disk

---
 quax/integrals/libint_interface.cc | 231 +++++++++++++++++++++++++++++
 quax/methods/ints.py               |  49 +++++-
 2 files changed, 278 insertions(+), 2 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index cdb3214..2d9cd9b 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -1466,6 +1466,236 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
     std::cout << " done" << std::endl;
 } // compute_1e_deriv_disk 
 
+// Write dipole derivatives up to `max_deriv_order` to disk
+// HDF5 File Name: dipole_derivs.h5 
+//      HDF5 Dataset names within the file:
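+//      mu_x_nbf1_nbf2_deriv1, mu_y_nbf1_nbf2_deriv1, mu_z_nbf1_nbf2_deriv1
+//          shape (nbf1,nbf2,n_unique_1st_derivs)
+//      mu_x_nbf1_nbf2_deriv2, mu_y_nbf1_nbf2_deriv2, mu_z_nbf1_nbf2_deriv2
+//          shape (nbf1,nbf2,n_unique_2nd_derivs)
+//      mu_x_nbf1_nbf2_deriv3, mu_y_nbf1_nbf2_deriv3, mu_z_nbf1_nbf2_deriv3
+//          shape (nbf1,nbf2,n_unique_3rd_derivs)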
+//      ...
+// The number of unique derivatives is essentially equal to the size of the
+// generalized upper triangle of the derivative tensor.
+void compute_dipole_deriv_disk(int max_deriv_order) {
+    std::cout << "Writing dipole integral derivative tensors up to order " << max_deriv_order << " to disk...";
+    long total_deriv_slices = 0;
+    for (int i = 1; i <= max_deriv_order; i++){
+        total_deriv_slices += how_many_derivs(natom, i);
+    }
+
+    // Shell pairs after screening
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
+    // Create H5 File and prepare to fill with 0.0's
+    const H5std_string file_name("dipole_derivs.h5");
+    H5File* file = new H5File(file_name,H5F_ACC_TRUNC);
+    double fillvalue = 0.0;
+    DSetCreatPropList plist;
+    plist.setFillValue(PredType::NATIVE_DOUBLE, &fillvalue);
+
+    for (int deriv_order = 1; deriv_order <= max_deriv_order; deriv_order++){
+        // how many unique cartesian nuclear derivatives (e.g., so we only save one of d^2/dx1dx2 and d^2/dx2dx1, etc)
+        unsigned int nderivs_triu = how_many_derivs(natom, deriv_order);
+
+        // Create mappings from 1d buffer index (flattened upper triangle shell derivative index) to multidimensional shell derivative index
+        // The lookup spans the 6 Cartesian coordinates of the two shell centers (3 per shell)
+        const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+        
+        // Create mapping from 1d cartesian coordinate index (flattened upper triangle cartesian derivative index) to multidimensional index
+        const std::vector<std::vector<int>> cart_multidim_lookup = generate_multi_index_lookup(ncart, deriv_order);
+
+        // Define engines and buffers
+        std::vector<libint2::Engine> engines(nthreads);
+
+        // COM generator
+        std::array<double,3> COM = {0.000, 0.000, 0.000};
+
+        // Will compute overlap + electric dipole moments
+        engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l, deriv_order);
+        engines[0].set_params(COM); // with COM as the multipole origin
+        engines[0].set_precision(max_engine_precision);
+        engines[0].prescale_by(-1);
+        for (size_t i = 1; i != nthreads; ++i) {
+            engines[i] = engines[0];
+        }
+
+        // Define HDF5 dataset names
+        const H5std_string Mu_X_dset_name("mu_x_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                                  + "_deriv" + std::to_string(deriv_order));
+        const H5std_string Mu_Y_dset_name("mu_y_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                                  + "_deriv" + std::to_string(deriv_order));
+        const H5std_string Mu_Z_dset_name("mu_z_" + std::to_string(nbf1) + "_" + std::to_string(nbf2) 
+                                                  + "_deriv" + std::to_string(deriv_order));
+
+        // Define rank and dimensions of data that will be written to the file
+        hsize_t file_dims[] = {nbf1, nbf2, nderivs_triu};
+        DataSpace fspace(3, file_dims);
+        // Create dataset for each integral type and write 0.0's into the file 
+        DataSet* Mu_X_dataset = new DataSet(file->createDataSet(Mu_X_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        DataSet* Mu_Y_dataset = new DataSet(file->createDataSet(Mu_Y_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        DataSet* Mu_Z_dataset = new DataSet(file->createDataSet(Mu_Z_dset_name, PredType::NATIVE_DOUBLE, fspace, plist));
+        hsize_t stride[3] = {1, 1, 1}; // stride and block can be used to 
+        hsize_t block[3] = {1, 1, 1};  // add values to multiple places, useful if symmetry ever used.
+        hsize_t zerostart[3] = {0, 0, 0};
+
+        /* Initialize lock */
+        omp_init_lock(&lock);
+
+#pragma omp parallel for num_threads(nthreads)
+        for (const auto &pair : shellpairs) {
+            int p1 = pair.first;
+            int p2 = pair.second;
+
+            const auto &s1 = bs1[p1];
+            const auto &s2 = bs2[p2];
+            auto n1 = bs1[p1].size(); // number of basis functions in first shell
+            auto n2 = bs2[p2].size(); // number of basis functions in second shell
+            auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+            auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+            auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+            auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+            std::vector<long> shell_atom_index_list{atom1, atom2};
+
+            int thread_id = 0;
+#ifdef _OPENMP
+            thread_id = omp_get_thread_num();
+#endif
+            engines[thread_id].compute(s1, s2); // Compute shell set
+            const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+
+            // Define shell set slabs
+            double Mu_X_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double Mu_Y_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double Mu_Z_shellset_slab_12 [n1][n2][nderivs_triu] = {};
+            double Mu_X_shellset_slab_21 [n2][n1][nderivs_triu] = {};
+            double Mu_Y_shellset_slab_21 [n2][n1][nderivs_triu] = {};
+            double Mu_Z_shellset_slab_21 [n2][n1][nderivs_triu] = {};
+
+            // Loop over every possible unique nuclear cartesian derivative index (flattened upper triangle)
+            // For 1st derivatives of 2 atom system, this is 6. 2nd derivatives of 2 atom system: 21, etc
+            for(int nuc_idx = 0; nuc_idx < nderivs_triu; ++nuc_idx) {
+                // Look up multidimensional cartesian derivative index
+                auto multi_cart_idx = cart_multidim_lookup[nuc_idx];
+                // Create a vector of vectors called `indices`, where each subvector
+                // is your possible choices for the first derivative operator, second, third, etc and the total number of subvectors is order of differentiation
+                // What follows fills these indices
+                std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+
+                // Loop over each cartesian coordinate index which we are differentiating wrt for this nuclear cartesian derivative index
+                // and check to see if it is present in the shell duet
+                for (int j = 0; j < multi_cart_idx.size(); j++){
+                    int desired_atom_idx = multi_cart_idx[j] / 3;
+                    int desired_coord = multi_cart_idx[j] % 3;
+                    // Loop over shell indices
+                    for (int i = 0; i < 2; i++){
+                        int atom_idx = shell_atom_index_list[i];
+                        if (atom_idx == desired_atom_idx) {
+                            int tmp = 3 * i + desired_coord;
+                            indices[j].push_back(tmp);
+                        }
+                    }
+                }
+
+                // Now indices is a vector of vectors, where each subvector is your choices for the first derivative operator, second, third, etc
+                // and the total number of subvectors is the order of differentiation
+                // Now we want all combinations where we pick exactly one index from each subvector.
+                // This is achievable through a cartesian product
+                std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+                std::vector<int> buffer_indices;
+                // Dipole integrals: collect needed buffer indices (stride 4: overlap, mu_x, mu_y, mu_z) which we need to sum for this nuclear cartesian derivative
+                for (auto vec : index_combos)  {
+                    std::sort(vec.begin(), vec.end());
+                    int buf_idx = 0;
+                    auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+                    if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+                    buffer_indices.push_back(buf_idx * 4);
+                }
+
+                // Loop over shell block for each buffer index which contributes to this derivative
+                if (p1 != p2) {
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto mu_x_shellset = buf_vec[buffer_indices[i] + 1];
+                        auto mu_y_shellset = buf_vec[buffer_indices[i] + 2];
+                        auto mu_z_shellset = buf_vec[buffer_indices[i] + 3];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                Mu_X_shellset_slab_12[f1][f2][nuc_idx] =
+                                    Mu_X_shellset_slab_21[f2][f1][nuc_idx] += mu_x_shellset[idx];
+                                Mu_Y_shellset_slab_12[f1][f2][nuc_idx] =
+                                    Mu_Y_shellset_slab_21[f2][f1][nuc_idx] += mu_y_shellset[idx];
+                                Mu_Z_shellset_slab_12[f1][f2][nuc_idx] =
+                                    Mu_Z_shellset_slab_21[f2][f1][nuc_idx] += mu_z_shellset[idx];
+                            }
+                        }
+                    }
+                } else { 
+                    for(auto i = 0; i < buffer_indices.size(); ++i) {
+                        auto mu_x_shellset = buf_vec[buffer_indices[i] + 1];
+                        auto mu_y_shellset = buf_vec[buffer_indices[i] + 2];
+                        auto mu_z_shellset = buf_vec[buffer_indices[i] + 3];
+                        for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                            for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                                Mu_X_shellset_slab_12[f1][f2][nuc_idx] += mu_x_shellset[idx];
+                                Mu_Y_shellset_slab_12[f1][f2][nuc_idx] += mu_y_shellset[idx];
+                                Mu_Z_shellset_slab_12[f1][f2][nuc_idx] += mu_z_shellset[idx];
+                            }
+                        }
+                    }
+                }
+            } // Unique nuclear cartesian derivative indices loop
+
+            /* Serialize HDF dataset writing using OpenMP lock */
+            omp_set_lock(&lock);
+
+            // Now write this shell set slab to HDF5 file
+            // Create file space hyperslab, defining where to write data to in file
+            hsize_t count[3] = {n1, n2, nderivs_triu};
+            hsize_t start[3] = {bf1, bf2, 0};
+            fspace.selectHyperslab(H5S_SELECT_SET, count, start, stride, block);
+            // Create memory dataspace describing the slab to be written to the file
+            hsize_t mem_dims[] = {n1, n2, nderivs_triu};
+            DataSpace mspace(3, mem_dims);
+            mspace.selectHyperslab(H5S_SELECT_SET, count, zerostart, stride, block);
+            // Write buffer data 'shellset_slab' with data type double from memory dataspace `mspace` to file dataspace `fspace`
+            Mu_X_dataset->write(Mu_X_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+            Mu_Y_dataset->write(Mu_Y_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+            Mu_Z_dataset->write(Mu_Z_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
+
+            if (p1 != p2) {
+                // Now write this shell set slab to HDF5 file
+                // Create file space hyperslab, defining where to write data to in file
+                hsize_t count_T[3] = {n2, n1, nderivs_triu};
+                hsize_t start_T[3] = {bf2, bf1, 0};
+                fspace.selectHyperslab(H5S_SELECT_SET, count_T, start_T, stride, block);
+                // Create memory dataspace describing the transposed slab to be written to the file
+                hsize_t mem_dims_T[] = {n2, n1, nderivs_triu};
+                DataSpace mspace_T(3, mem_dims_T);
+                mspace_T.selectHyperslab(H5S_SELECT_SET, count_T, zerostart, stride, block);
+                // Write buffer data 'shellset_slab_21' with data type double from memory dataspace `mspace_T` to file dataspace `fspace`
+                Mu_X_dataset->write(Mu_X_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                Mu_Y_dataset->write(Mu_Y_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+                Mu_Z_dataset->write(Mu_Z_shellset_slab_21, PredType::NATIVE_DOUBLE, mspace_T, fspace);
+            }
+
+            /* Release lock */
+            omp_unset_lock(&lock);
+
+        } // shell duet loops
+        // Delete datasets for this derivative order
+        delete Mu_X_dataset;
+        delete Mu_Y_dataset;
+        delete Mu_Z_dataset;
+    } // deriv order loop
+
+    /* Finished lock mechanism, destroy it */
+    omp_destroy_lock(&lock);
+    // close the file
+    delete file;
+    std::cout << " done" << std::endl;
+} // compute_dipole_deriv_disk 
+
 
 // Writes TEI derivatives up to `max_deriv_order` to disk.
 // HDF5 File Name: tei_derivs.h5 
@@ -2472,6 +2702,7 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("compute_dipole_derivs", &compute_dipole_derivs, "Computes electric (Cartesian) dipole nuclear integrals with libint");
     m.def("compute_2e_deriv", &compute_2e_deriv, "Computes two-electron integral nuclear derivatives with libint");
     m.def("compute_1e_deriv_disk", &compute_1e_deriv_disk, "Computes one-electron nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
+    m.def("compute_dipole_deriv_disk", &compute_dipole_deriv_disk, "Computes dipole nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("compute_2e_deriv_disk", &compute_2e_deriv_disk, "Computes coulomb integral nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_disk", &oei_deriv_disk, "Computes overlap, kinetic, and potential integral derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("oei_deriv_core", &oei_deriv_core, "Computes a single OEI integral derivative tensor, in memory.");
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 670a25f..43c31a4 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -62,10 +62,24 @@ def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
     basis_name = basis_set.name()
     libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
 
-    oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
+    if algo == 'libint_disk':
+        # Check disk for currently existing integral derivatives
+        check_multipole = check_multipole_disk('dipole', basis_set, basis_set, deriv_order)
 
-    Mu_ = oei_obj.dipole(geom)
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        # If disk integral derivs are right, nothing to do
+        if check_multipole:
+            Mu_ = oei_obj.dipole(geom)
+        else:
+            libint_interface.compute_dipole_deriv_disk(deriv_order)
+            Mu_ = oei_obj.dipole(geom)
+    else:
+        # Build the in-memory OEI object for dipole integrals
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
+        # Compute integrals
+        Mu_ = oei_obj.dipole(geom)
 
+    libint_interface.finalize()
     return Mu_
 
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
@@ -240,6 +254,37 @@ def check_oei_disk(int_type, basis1, basis2, deriv_order, address=None):
         oeifile.close()
         correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2 """
 
+def check_multipole_disk(int_type, basis1, basis2, deriv_order, address=None):
+    # Check OEI's in compute_integrals
+    correct_int_derivs = False
+    correct_nbf1 = correct_nbf2 = correct_deriv_order = False
+
+    if ((os.path.exists("dipole_derivs.h5"))):
+        print("Found currently existing multipole integral derivatives in your working directory. Trying to use them.")
+        oeifile = h5py.File('dipole_derivs.h5', 'r')
+        nbf1 = basis1.nbf()
+        nbf2 = basis2.nbf()
+
+        if int_type == "dipole":
+            oei_name = ["mu_x_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
+                        "mu_y_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
+                        "mu_z_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order)]
+        else:
+            raise Exception("Only dipole integrals currently.")
+
+        for name in list(oeifile.keys()):
+            if name in oei_name:
+                correct_nbf1 = oeifile[name].shape[0] == nbf1
+                correct_nbf2 = oeifile[name].shape[1] == nbf2
+                correct_deriv_order = True
+        oeifile.close()
+
+        correct_int_derivs = correct_deriv_order and correct_nbf1 and correct_nbf2
+
+    if correct_int_derivs:
+        print("Integral derivatives appear to be correct. Avoiding recomputation.")
+    return correct_int_derivs
+
 def check_tei_disk(int_type, basis1, basis2, basis3, basis4, deriv_order, address=None):
     # Check TEI's in compute_integrals
     correct_int_derivs = False

From 08b1d16070093e7d26c422e21851bedecd18d353 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 8 Apr 2024 12:28:01 -0400
Subject: [PATCH 65/91] Quadrupole ints working

---
 quax/core.py                       |  59 +++-
 quax/integrals/libint_interface.cc | 481 ++++++++++++++++++-----------
 quax/integrals/oei.py              | 155 +++++++++-
 quax/integrals/utils.h             | 173 +++++++++++
 quax/methods/hartree_fock.py       |  22 +-
 quax/methods/ints.py               |  46 ++-
 6 files changed, 726 insertions(+), 210 deletions(-)
 create mode 100644 quax/integrals/utils.h

diff --git a/quax/core.py b/quax/core.py
index c1eedb0..2f76d57 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -40,7 +40,7 @@ def check_options(options):
                        'ints_tolerance': 1.0e-14,
                        'freeze_core': False,
                        'beta': 1.0,
-                       'electric_field': False
+                       'electric_field': 0
                       }
 
     for key in options.keys():
@@ -390,16 +390,24 @@ def geom_deriv(molecule, basis_name, method, deriv_order=1, partial=None, option
 
     return compute_standard(method, args, deriv_order=deriv_order, partial=partial, options=options)
 
-def efield_deriv(molecule, basis_name, method, electric_field=None, deriv_order=1, partial=None, options=None):
+def efield_deriv(molecule, basis_name, method, efield=None, efield_grad=None,
+                 deriv_order=1, partial=None, options=None):
     """
     """
-    if type(electric_field) == type(None):
+    if type(efield) == type(None) and type(efield_grad) == type(None):
+        raise Exception("Electric field and its gradient must be given for quadrupole computation.")
+    elif type(efield) == type(None):
         raise Exception("Electric field must be given for dipole computation.")
     
     try:
         options['electric_field']
     except:
-        options['electric_field'] = True
+        if isinstance(efield, np.ndarray) and isinstance(efield_grad, np.ndarray):
+            options['electric_field'] = 2
+        elif isinstance(efield, np.ndarray):
+            options['electric_field'] = 1
+        else:
+            raise Exception("Electric field and its gradient must be given as numpy arrays.")
     
     # Set keyword options
     if options:
@@ -428,36 +436,48 @@ def efield_deriv(molecule, basis_name, method, electric_field=None, deriv_order=
     print("Basis name: ", basis_set.name())
     print("Number of basis functions: ", nbf)
 
+    if options['electric_field'] == 2:
+        args = (efield_grad, efield)
+    else:
+        args = (efield,)
+
     if method == 'scf' or method == 'hf' or method == 'rhf':
-        args = (electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
     elif method =='mp2':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='mp2-f12':
         cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
-        args = (electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='ccsd':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='ccsd(t)':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     else:
         print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
 
     return compute_standard(method, args, deriv_order=deriv_order, partial=partial, options=options)
 
-def mixed_deriv(molecule, basis_name, method, electric_field=None,
+def mixed_deriv(molecule, basis_name, method, efield=None, efield_grad=None,
                 deriv_order_F=1, deriv_order_R=1, partial_F=None, partial_R=None, options=None):
     """
     """
     if deriv_order_F == 0 or deriv_order_R == 0:
         raise Exception("Error: Order of differentiation cannot equal zero. Use energy or geometry_deriv or electric_field instead.")
 
-    if type(electric_field) == type(None):
+    if type(efield) == type(None) and type(efield_grad) == type(None):
+        raise Exception("Electric field and its gradient must be given for quadrupole computation.")
+    elif type(efield) == type(None):
         raise Exception("Electric field must be given for dipole computation.")
     
     try:
         options['electric_field']
     except:
-        options['electric_field'] = True
+        if isinstance(efield, np.ndarray) and isinstance(efield_grad, np.ndarray):
+            options['electric_field'] = 2
+        elif isinstance(efield, np.ndarray):
+            options['electric_field'] = 1
+        else:
+            raise Exception("Electric field and its gradient must be given as numpy arrays.")
     
     # Set keyword options
     if options:
@@ -486,17 +506,22 @@ def mixed_deriv(molecule, basis_name, method, electric_field=None,
     print("Basis name: ", basis_set.name())
     print("Number of basis functions: ", nbf)
 
+    if options['electric_field'] == 2:
+        args = (efield, efield_grad)
+    else:
+        args = (efield,)
+
     if method == 'scf' or method == 'hf' or method == 'rhf':
-        args = (electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
     elif method =='mp2':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='mp2-f12':
         cabs_set = build_RIBS(molecule, basis_set, basis_name + '-cabs')
-        args = (electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='ccsd':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     elif method =='ccsd(t)':
-        args = (electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+        args += (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
     else:
         print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
 
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 2d9cd9b..4f5c607 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -11,6 +11,8 @@
 #include <pybind11/stl.h>
 #include <libint2.hpp>
 
+#include "utils.h"
+
 // TODO support spherical harmonic gaussians, implement symmetry considerations, support 5th, 6th derivs
 
 namespace py = pybind11;
@@ -32,51 +34,6 @@ int nthreads = 1;
 double threshold;
 double max_engine_precision;
 
-// Creates atom objects from xyz file path
-std::vector<libint2::Atom> get_atoms(std::string xyzfilename) 
-{
-    std::ifstream input_file(xyzfilename);
-    std::vector<libint2::Atom> atoms = libint2::read_dotxyz(input_file);
-    return atoms;
-}
-
-// Creates a combined basis set
-libint2::BasisSet make_ao_cabs(std::string obs_name, libint2::BasisSet cabs) {
-    // Create OBS
-    obs_name.erase(obs_name.end() - 5, obs_name.end());
-    auto obs = libint2::BasisSet(obs_name, atoms);
-    obs.set_pure(false); // use cartesian gaussians
-
-    auto obs_idx = obs.atom2shell(atoms);
-    auto cabs_idx = cabs.atom2shell(atoms);
-
-    std::vector<std::vector<libint2::Shell>> el_bases(36); // Only consider atoms up to Kr
-    for (size_t i = 0; i < atoms.size(); i++) {
-        if (el_bases[atoms[i].atomic_number].empty()) {
-            std::vector<libint2::Shell> tmp;
-
-            for(long int& idx : obs_idx[i]) {
-                tmp.push_back(obs[idx]);
-            }
-            for(long int& idx : cabs_idx[i]) {
-                tmp.push_back(cabs[idx]);
-            }
-
-            stable_sort(tmp.begin(), tmp.end(), [](const auto& a, const auto& b) -> bool
-            {
-                return a.contr[0].l < b.contr[0].l;
-            });
-
-            el_bases[atoms[i].atomic_number] = tmp;
-        }
-    }
-
-    // Create CABS, union of orbital and auxiliary basis AOs
-    cabs = libint2::BasisSet(atoms, el_bases);
-    cabs.set_pure(false);
-    return cabs;
-}
-
 // Must call initialize before computing ints 
 void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
                 std::string basis3, std::string basis4, double ints_tol) {
@@ -91,25 +48,25 @@ void initialize(std::string xyzfilename, std::string basis1, std::string basis2,
     bs1 = libint2::BasisSet(basis1, atoms);
     bs1.set_pure(false); // use cartesian gaussians
     if (basis1.find("-cabs", 10) != std::string::npos) {
-        bs1 = make_ao_cabs(basis1, bs1);
+        bs1 = make_ao_cabs(atoms, basis1, bs1);
     }
 
     bs2 = libint2::BasisSet(basis2, atoms);
     bs2.set_pure(false); // use cartesian gaussians
     if (basis2.find("-cabs", 10) != std::string::npos) {
-        bs2 = make_ao_cabs(basis2, bs2);
+        bs2 = make_ao_cabs(atoms, basis2, bs2);
     }
 
     bs3 = libint2::BasisSet(basis3, atoms);
     bs3.set_pure(false); // use cartesian gaussians
     if (basis3.find("-cabs", 10) != std::string::npos) {
-        bs3 = make_ao_cabs(basis3, bs3);
+        bs3 = make_ao_cabs(atoms, basis3, bs3);
     }
 
     bs4 = libint2::BasisSet(basis4, atoms);
     bs4.set_pure(false); // use cartesian gaussians
     if (basis4.find("-cabs", 10) != std::string::npos) {
-        bs4 = make_ao_cabs(basis4, bs4);
+        bs4 = make_ao_cabs(atoms, basis4, bs4);
     }
 
     nbf1 = bs1.nbf();
@@ -141,131 +98,6 @@ void finalize() {
     libint2::finalize();
 }
 
-// Used to make contracted Gaussian-type geminal for F12 methods
-std::vector<std::pair<double, double>> make_cgtg(double exponent) {
-    // The fitting coefficients and the exponents from MPQC
-    std::vector<std::pair<double, double>> exp_coeff = {};
-    std::vector<double> coeffs = {-0.31442480597241274, -0.30369575353387201, -0.16806968430232927,
-                                  -0.098115812152857612, -0.060246640234342785, -0.037263541968504843};
-    std::vector<double> exps = {0.22085085450735284, 1.0040191632019282, 3.6212173098378728,
-                                12.162483236221904, 45.855332448029337, 254.23460688554644};
-
-    for (int i = 0; i < exps.size(); i++){
-        auto exp_scaled = (exponent * exponent) * exps[i];
-        exp_coeff.push_back(std::make_pair(exp_scaled, coeffs[i]));
-    }
-    
-    return exp_coeff;
-}
-
-// Returns square of cgtg
-std::vector<std::pair<double, double>> take_square(std::vector<std::pair<double, double>> input) {
-    auto n = input.size();
-    std::vector<std::pair<double, double>> output;
-    for (int i = 0; i < n; ++i) {
-        auto e_i = input[i].first;
-        auto c_i = input[i].second;
-        for (int j = i; j < n; ++j) {
-            auto e_j = input[j].first;
-            auto c_j = input[j].second;
-            double scale = i == j ? 1.0 : 2.0;
-            output.emplace_back(std::make_pair(e_i + e_j, scale * c_i * c_j));
-        }
-    }
-    return output;
-}
-
-// Cartesian product of arbitrary number of vectors, given a vector of vectors
-// Used to find all possible combinations of indices which correspond to desired nuclear derivatives
-// For example, if molecule has two atoms, A and B, and we want nuclear derivative d^2/dAz dBz,
-// represented by deriv_vec = [0,0,1,0,0,1], and we are looping over 4 shells in ERI's,
-// and the four shells are atoms (0,0,1,1), then possible indices 
-// of the 0-11 shell cartesian component indices are {2,5} for d/dAz and {8,11} for d/dBz.
-// So the vector passed to cartesian_product is { {{2,5},{8,11}}, and all combinations of elements
-// from first and second subvectors are produced, and the total nuclear derivative of the shell
-// is obtained by summing all of these pieces together.
-// These resulting indices are converted to flattened Libint buffer indices using the generate_*_lookup functions,
-// explained below.
-std::vector<std::vector<int>> cartesian_product (const std::vector<std::vector<int>>& v) {
-    std::vector<std::vector<int>> s = {{}};
-    for (const auto& u : v) {
-        std::vector<std::vector<int>> r;
-        for (const auto& x : s) {
-            for (const auto y : u) {
-                r.push_back(x);
-                r.back().push_back(y);
-            }
-        }
-        s = std::move(r);
-    }
-    return s;
-}
-
-// Converts a derivative vector (3*Natom array of integers defining which coordinates to 
-// differentiate wrt and how many times) to a set of atom indices and coordinate indices 0,1,2->x,y,z
-void process_deriv_vec(std::vector<int> deriv_vec, 
-                       std::vector<int> *desired_atoms, 
-                       std::vector<int> *desired_coordinates) 
-{
-    for (int i = 0; i < deriv_vec.size(); i++) {
-        if (deriv_vec[i] > 0) {
-            for (int j = 0; j < deriv_vec[i]; j++) {
-                desired_atoms->push_back(i / 3);
-                desired_coordinates->push_back(i % 3);
-            }
-        }
-    }
-}
-
-// Returns total size of the libint integral derivative buffer, which is how many unique nth order derivatives
-// wrt k objects which have 3 differentiable coordinates each
-// k: how many centers
-// n: order of differentiation
-// l: how many atoms (needed for potential integrals only!)
-int how_many_derivs(int k, int n, int l = 0) {
-    int val = 1;
-    int factorial = 1;
-    for (int i=0; i < n; i++) {
-        val *= (3 * (k + l) + i);
-        factorial *= i + 1;
-    }
-    val /= factorial;
-    return val;
-}
-
-void cwr_recursion(std::vector<int> inp,
-                   std::vector<int> &out,
-                   std::vector<std::vector<int>> &result,
-                   int k, int i, int n)
-{
-    // base case: if combination size is k, add to result 
-    if (out.size() == k){
-        result.push_back(out);
-        return;
-    }
-    for (int j = i; j < n; j++){
-        out.push_back(inp[j]);
-        cwr_recursion(inp, out, result, k, j, n);
-        // backtrack - remove current element from solution
-        out.pop_back();
-    }
-}
-
-std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv_order) {
-    using namespace std;
-    // Generate vector of indices 0 through nparams-1
-    vector<int> inp;
-    for (int i = 0; i < nparams; i++) {
-        inp.push_back(i);
-    }
-    // Generate all possible combinations with repitition. 
-    // These are upper triangle indices, and the length of them is the total number of derivatives
-    vector<int> out;
-    vector<vector<int>> combos;
-    cwr_recursion(inp, out, combos, deriv_order, 0, nparams);
-    return combos;
-}
-
 // Computes non-negligible shell pair list for one-electron integrals
 std::vector<std::pair<int, int>> build_shellpairs(libint2::BasisSet A, libint2::BasisSet B) {
     const auto A_equiv_B = (A == B);
@@ -543,6 +375,116 @@ std::vector<py::array> compute_dipole_ints() {
             py::array(Mu_Z.size(), Mu_Z.data())};
 }
 
+// Compute one-electron dipole and quadrupole integrals
+std::vector<py::array> compute_quadrupole_ints() {
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
+    // Integral engine
+    std::vector<libint2::Engine> engines(nthreads);
+
+    // COM generator
+    std::array<double,3> COM = {0.000, 0.000, 0.000};
+
+    // Will compute overlap + electric dipole + electric quadrupole moments
+    engines[0] = libint2::Engine(libint2::Operator::emultipole2, max_nprim, max_l);
+    engines[0].set_params(COM); // with COM as the multipole origin
+    engines[0].set_precision(max_engine_precision);
+    engines[0].prescale_by(-1);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    size_t length = nbf1 * nbf2;
+    std::vector<double> Mu_X(length);  // Mu_X Vector
+    std::vector<double> Mu_Y(length);  // Mu_Y Vector
+    std::vector<double> Mu_Z(length);  // Mu_Z Vector
+    std::vector<double> Th_XX(length); // Th_XX Vector
+    std::vector<double> Th_XY(length); // Th_XY Vector
+    std::vector<double> Th_XZ(length); // Th_XZ Vector
+    std::vector<double> Th_YY(length); // Th_YY Vector
+    std::vector<double> Th_YZ(length); // Th_YZ Vector
+    std::vector<double> Th_ZZ(length); // Th_ZZ Vector
+
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+        auto mu_x_shellset  = buf_vec[1];
+        auto mu_y_shellset  = buf_vec[2];
+        auto mu_z_shellset  = buf_vec[3];
+        auto th_xx_shellset = buf_vec[4];
+        auto th_xy_shellset = buf_vec[5];
+        auto th_xz_shellset = buf_vec[6];
+        auto th_yy_shellset = buf_vec[7];
+        auto th_yz_shellset = buf_vec[8];
+        auto th_zz_shellset = buf_vec[9];
+
+        if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr)
+            continue;  // nullptr returned if the entire shell-set was screened out
+
+        // Loop over shell block, keeping a total count idx for the size of shell set
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                         Mu_X[(bf2 + f2) * nbf1 + bf1 + f1]  = mu_x_shellset[idx];
+                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                         Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1]  = mu_y_shellset[idx];
+                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                         Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1]  = mu_z_shellset[idx];
+                    Th_XX[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_XX[(bf2 + f2) * nbf1 + bf1 + f1] = th_xx_shellset[idx];
+                    Th_XY[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_XY[(bf2 + f2) * nbf1 + bf1 + f1] = th_xy_shellset[idx];
+                    Th_XZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_XZ[(bf2 + f2) * nbf1 + bf1 + f1] = th_xz_shellset[idx];
+                    Th_YY[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_YY[(bf2 + f2) * nbf1 + bf1 + f1] = th_yy_shellset[idx];
+                    Th_YZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_YZ[(bf2 + f2) * nbf1 + bf1 + f1] = th_yz_shellset[idx];
+                    Th_ZZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                         Th_ZZ[(bf2 + f2) * nbf1 + bf1 + f1] = th_zz_shellset[idx];
+                }
+            }
+        } else {
+            for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2]  = mu_x_shellset[idx];
+                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2]  = mu_y_shellset[idx];
+                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2]  = mu_z_shellset[idx];
+                    Th_XX[(bf1 + f1) * nbf2 + bf2 + f2] = th_xx_shellset[idx];
+                    Th_XY[(bf1 + f1) * nbf2 + bf2 + f2] = th_xy_shellset[idx];
+                    Th_XZ[(bf1 + f1) * nbf2 + bf2 + f2] = th_xz_shellset[idx];
+                    Th_YY[(bf1 + f1) * nbf2 + bf2 + f2] = th_yy_shellset[idx];
+                    Th_YZ[(bf1 + f1) * nbf2 + bf2 + f2] = th_yz_shellset[idx];
+                    Th_ZZ[(bf1 + f1) * nbf2 + bf2 + f2] = th_zz_shellset[idx];
+                }
+            }
+        }
+    }
+    return {py::array(Mu_X.size(), Mu_X.data()), py::array(Mu_Y.size(), Mu_Y.data()),
+            py::array(Mu_Z.size(), Mu_Z.data()), py::array(Th_XX.size(), Th_XX.data()),
+            py::array(Th_XY.size(), Th_XY.data()), py::array(Th_XZ.size(), Th_XZ.data()),
+            py::array(Th_YY.size(), Th_YY.data()), py::array(Th_YZ.size(), Th_YZ.data()),
+            py::array(Th_ZZ.size(), Th_ZZ.data())};
+}
+
 // Computes two-electron integrals
 py::array compute_2e_int(std::string type, double beta) {
     // Shell screening
@@ -958,12 +900,9 @@ std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
                     continue;  // nullptr returned if the entire shell-set was screened out
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] += mu_x_shellset[idx];
-                        Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] += mu_x_shellset[idx];
-                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] += mu_y_shellset[idx];
-                        Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] += mu_y_shellset[idx];
-                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] += mu_z_shellset[idx];
-                        Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] += mu_z_shellset[idx];
+                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] = Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] += mu_x_shellset[idx];
+                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] = Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] += mu_y_shellset[idx];
+                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] = Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] += mu_z_shellset[idx];
                     }
                 }
             }
@@ -988,6 +927,186 @@ std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
             py::array(Mu_Z.size(), Mu_Z.data())};
 }
 
+// Computes nuclear derivatives of dipole and quadrupole integrals
+std::vector<py::array> compute_quadrupole_derivs(std::vector<int> deriv_vec) {
+    assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
+    // Get order of differentiation
+    int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
+
+    // Convert deriv_vec to set of atom indices and their cartesian components which we are differentiating wrt
+    std::vector<int> desired_atom_indices;
+    std::vector<int> desired_coordinates;
+    process_deriv_vec(deriv_vec, &desired_atom_indices, &desired_coordinates);
+
+    // Create mappings from 1d buffer index (flattened upper triangle shell derivative index)
+    // to multidimensional shell derivative index
+    const std::vector<std::vector<int>> buffer_multidim_lookup = generate_multi_index_lookup(6, deriv_order);
+
+    // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
+    auto shellpairs = build_shellpairs(bs1, bs2);
+
+    // Integral engine
+    std::vector<libint2::Engine> engines(nthreads);
+
+    // COM generator
+    std::array<double,3> COM = {0.000, 0.000, 0.000};
+
+    // Will compute overlap + electric dipole + electric quadrupole moments
+    engines[0] = libint2::Engine(libint2::Operator::emultipole2, max_nprim, max_l, deriv_order);
+    engines[0].set_params(COM); // with COM as the multipole origin
+    engines[0].set_precision(max_engine_precision);
+    engines[0].prescale_by(-1);
+    for (size_t i = 1; i != nthreads; ++i) {
+        engines[i] = engines[0];
+    }
+
+    size_t length = nbf1 * nbf2;
+    std::vector<double> Mu_X(length); // Mu_X Vector
+    std::vector<double> Mu_Y(length); // Mu_Y Vector
+    std::vector<double> Mu_Z(length); // Mu_Z Vector
+    std::vector<double> Th_XX(length); // Th_XX Vector
+    std::vector<double> Th_XY(length); // Th_XY Vector
+    std::vector<double> Th_XZ(length); // Th_XZ Vector
+    std::vector<double> Th_YY(length); // Th_YY Vector
+    std::vector<double> Th_YZ(length); // Th_YZ Vector
+    std::vector<double> Th_ZZ(length); // Th_ZZ Vector
+
+#pragma omp parallel for num_threads(nthreads)
+    for (const auto &pair : shellpairs) {
+        int p1 = pair.first;
+        int p2 = pair.second;
+
+        const auto &s1 = bs1[p1];
+        const auto &s2 = bs2[p2];
+        auto n1 = bs1[p1].size(); // number of basis functions in first shell
+        auto n2 = bs2[p2].size(); // number of basis functions in second shell
+        auto bf1 = shell2bf_1[p1];  // first basis function in first shell
+        auto bf2 = shell2bf_2[p2];  // first basis function in second shell
+        auto atom1 = shell2atom_1[p1]; // Atom index of shell 1
+        auto atom2 = shell2atom_2[p2]; // Atom index of shell 2
+
+        // Create list of atom indices corresponding to each shell. Libint uses longs, so we will too.
+        std::vector<long> shell_atom_index_list{atom1, atom2};
+
+        int thread_id = 0;
+#ifdef _OPENMP
+        thread_id = omp_get_thread_num();
+#endif
+        engines[thread_id].compute(s1, s2); // Compute shell set
+        const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
+
+        // For every desired atom derivative, check shell and nuclear indices for a match,
+        // add it to subvector for that derivative
+        // Add in the coordinate index 0,1,2 (x,y,z) in desired coordinates and offset the index appropriately.
+        std::vector<std::vector<int>> indices(deriv_order, std::vector<int> (0,0));
+        for (int j = 0; j < desired_atom_indices.size(); j++){
+            int desired_atom_idx = desired_atom_indices[j];
+            // Shell indices
+            for (int i = 0; i < 2; i++){
+                int atom_idx = shell_atom_index_list[i];
+                if (atom_idx == desired_atom_idx) {
+                    int tmp = 3 * i + desired_coordinates[j];
+                    indices[j].push_back(tmp);
+                    continue; // Avoid adding same atom and coord twice
+                }
+            }
+        }
+
+        // Now indices is a vector of vectors, where each subvector is your choices
+        // for the first derivative operator, second, third, etc
+        // and the total number of subvectors is the order of differentiation
+        // Now we want all combinations where we pick exactly one index from each subvector.
+        // This is achievable through a cartesian product
+        std::vector<std::vector<int>> index_combos = cartesian_product(indices);
+        std::vector<int> buffer_indices;
+
+        // Collect needed buffer indices which we need to sum for this nuclear cartesian derivative
+        for (auto vec : index_combos)  {
+            std::sort(vec.begin(), vec.end());
+            int buf_idx = 0;
+            auto it = lower_bound(buffer_multidim_lookup.begin(), buffer_multidim_lookup.end(), vec);
+            if (it != buffer_multidim_lookup.end()) buf_idx = it - buffer_multidim_lookup.begin();
+            buffer_indices.push_back(buf_idx * 10);
+        }
+
+        // Loop over every buffer index and accumulate for every shell set.
+        if (bs1_equiv_bs2 && p1 != p2) {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto mu_x_shellset  = buf_vec[buffer_indices[i] + 1];
+                auto mu_y_shellset  = buf_vec[buffer_indices[i] + 2];
+                auto mu_z_shellset  = buf_vec[buffer_indices[i] + 3];
+                auto th_xx_shellset = buf_vec[buffer_indices[i] + 4];
+                auto th_xy_shellset = buf_vec[buffer_indices[i] + 5];
+                auto th_xz_shellset = buf_vec[buffer_indices[i] + 6];
+                auto th_yy_shellset = buf_vec[buffer_indices[i] + 7];
+                auto th_yz_shellset = buf_vec[buffer_indices[i] + 8];
+                auto th_zz_shellset = buf_vec[buffer_indices[i] + 9];
+                if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr &&
+                    th_xx_shellset == nullptr && th_xy_shellset == nullptr && th_xz_shellset == nullptr &&
+                    th_yy_shellset == nullptr && th_yz_shellset == nullptr && th_zz_shellset == nullptr)
+                    continue;  // nullptr returned if the entire shell-set was screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                             Mu_X[(bf2 + f2) * nbf1 + bf1 + f1]  += mu_x_shellset[idx];
+                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                             Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1]  += mu_y_shellset[idx];
+                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2]  =
+                             Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1]  += mu_z_shellset[idx];
+                        Th_XX[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_XX[(bf2 + f2) * nbf1 + bf1 + f1] += th_xx_shellset[idx];
+                        Th_XY[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_XY[(bf2 + f2) * nbf1 + bf1 + f1] += th_xy_shellset[idx];
+                        Th_XZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_XZ[(bf2 + f2) * nbf1 + bf1 + f1] += th_xz_shellset[idx];
+                        Th_YY[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_YY[(bf2 + f2) * nbf1 + bf1 + f1] += th_yy_shellset[idx];
+                        Th_YZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_YZ[(bf2 + f2) * nbf1 + bf1 + f1] += th_yz_shellset[idx];
+                        Th_ZZ[(bf1 + f1) * nbf2 + bf2 + f2] =
+                             Th_ZZ[(bf2 + f2) * nbf1 + bf1 + f1] += th_zz_shellset[idx];
+                    }
+                }
+            }
+        } else {
+            for(auto i = 0; i < buffer_indices.size(); ++i) {
+                auto mu_x_shellset  = buf_vec[buffer_indices[i] + 1];
+                auto mu_y_shellset  = buf_vec[buffer_indices[i] + 2];
+                auto mu_z_shellset  = buf_vec[buffer_indices[i] + 3];
+                auto th_xx_shellset = buf_vec[buffer_indices[i] + 4];
+                auto th_xy_shellset = buf_vec[buffer_indices[i] + 5];
+                auto th_xz_shellset = buf_vec[buffer_indices[i] + 6];
+                auto th_yy_shellset = buf_vec[buffer_indices[i] + 7];
+                auto th_yz_shellset = buf_vec[buffer_indices[i] + 8];
+                auto th_zz_shellset = buf_vec[buffer_indices[i] + 9];
+                if (mu_x_shellset == nullptr && mu_y_shellset == nullptr && mu_z_shellset == nullptr &&
+                    th_xx_shellset == nullptr && th_xy_shellset == nullptr && th_xz_shellset == nullptr &&
+                    th_yy_shellset == nullptr && th_yz_shellset == nullptr && th_zz_shellset == nullptr)
+                    continue;  // nullptr returned if the entire shell-set was screened out
+                for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
+                    for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
+                        Mu_X[(bf1 + f1) * nbf2 + bf2 + f2]  += mu_x_shellset[idx];
+                        Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2]  += mu_y_shellset[idx];
+                        Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2]  += mu_z_shellset[idx];
+                        Th_XX[(bf1 + f1) * nbf2 + bf2 + f2] += th_xx_shellset[idx];
+                        Th_XY[(bf1 + f1) * nbf2 + bf2 + f2] += th_xy_shellset[idx];
+                        Th_XZ[(bf1 + f1) * nbf2 + bf2 + f2] += th_xz_shellset[idx];
+                        Th_YY[(bf1 + f1) * nbf2 + bf2 + f2] += th_yy_shellset[idx];
+                        Th_YZ[(bf1 + f1) * nbf2 + bf2 + f2] += th_yz_shellset[idx];
+                        Th_ZZ[(bf1 + f1) * nbf2 + bf2 + f2] += th_zz_shellset[idx];
+                    }
+                }
+            }
+        }
+    }
+    return {py::array(Mu_X.size(), Mu_X.data()), py::array(Mu_Y.size(), Mu_Y.data()),
+            py::array(Mu_Z.size(), Mu_Z.data()), py::array(Th_XX.size(), Th_XX.data()),
+            py::array(Th_XY.size(), Th_XY.data()), py::array(Th_XZ.size(), Th_XZ.data()),
+            py::array(Th_YY.size(), Th_YY.data()), py::array(Th_YZ.size(), Th_YZ.data()),
+            py::array(Th_ZZ.size(), Th_ZZ.data())};
+}
+
 // Computes nuclear derivatives of two-electron integrals
 py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
@@ -2697,9 +2816,11 @@ PYBIND11_MODULE(libint_interface, m) {
     m.def("finalize", &finalize, "Kills libint");
     m.def("compute_1e_int", &compute_1e_int, "Computes one-electron integrals with libint");
     m.def("compute_dipole_ints", &compute_dipole_ints, "Computes electric (Cartesian) dipole integrals with libint");
+    m.def("compute_quadrupole_ints", &compute_quadrupole_ints, "Computes electric (Cartesian) dipole and quadrupole integrals with libint");
     m.def("compute_2e_int", &compute_2e_int, "Computes two-electron integrals with libint");
     m.def("compute_1e_deriv", &compute_1e_deriv, "Computes one-electron integral nuclear derivatives with libint");
     m.def("compute_dipole_derivs", &compute_dipole_derivs, "Computes electric (Cartesian) dipole nuclear integrals with libint");
+    m.def("compute_quadrupole_derivs", &compute_quadrupole_derivs, "Computes electric (Cartesian) dipole and quadrupole nuclear integrals with libint");
     m.def("compute_2e_deriv", &compute_2e_deriv, "Computes two-electron integral nuclear derivatives with libint");
     m.def("compute_1e_deriv_disk", &compute_1e_deriv_disk, "Computes one-electron nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
     m.def("compute_dipole_deriv_disk", &compute_dipole_deriv_disk, "Computes dipole nuclear derivative tensors from 1st order up to nth order and writes them to disk with HDF5");
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 2679ab3..280c1b6 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -49,6 +49,8 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.potential_deriv_p = jax.core.Primitive("potential_deriv")
         self.dipole_p = jax.core.Primitive("dipole")
         self.dipole_deriv_p = jax.core.Primitive("dipole_deriv")
+        self.quadrupole_p = jax.core.Primitive("quadrupole")
+        self.quadrupole_deriv_p = jax.core.Primitive("quadrupole_deriv")
 
         # Register primitive evaluation rules
         self.overlap_p.def_impl(self.overlap_impl)
@@ -59,6 +61,8 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.potential_deriv_p.def_impl(self.potential_deriv_impl)
         self.dipole_p.def_impl(self.dipole_impl)
         self.dipole_deriv_p.def_impl(self.dipole_deriv_impl)
+        self.quadrupole_p.def_impl(self.quadrupole_impl)
+        self.quadrupole_deriv_p.def_impl(self.quadrupole_deriv_impl)
 
         # Register the JVP rules with JAX
         jax.interpreters.ad.primitive_jvps[self.overlap_p] = self.overlap_jvp
@@ -69,12 +73,15 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         jax.interpreters.ad.primitive_jvps[self.potential_deriv_p] = self.potential_deriv_jvp
         jax.interpreters.ad.primitive_jvps[self.dipole_p] = self.dipole_jvp
         jax.interpreters.ad.primitive_jvps[self.dipole_deriv_p] = self.dipole_deriv_jvp
+        jax.interpreters.ad.primitive_jvps[self.quadrupole_p] = self.quadrupole_jvp
+        jax.interpreters.ad.primitive_jvps[self.quadrupole_deriv_p] = self.quadrupole_deriv_jvp
 
         # Register the batching rules with JAX
         jax.interpreters.batching.primitive_batchers[self.overlap_deriv_p] = self.overlap_deriv_batch
         jax.interpreters.batching.primitive_batchers[self.kinetic_deriv_p] = self.kinetic_deriv_batch
         jax.interpreters.batching.primitive_batchers[self.potential_deriv_p] = self.potential_deriv_batch
         jax.interpreters.batching.primitive_batchers[self.dipole_deriv_p] = self.dipole_deriv_batch
+        jax.interpreters.batching.primitive_batchers[self.quadrupole_deriv_p] = self.quadrupole_deriv_batch
 
     # Create functions to call primitives
     def overlap(self, geom):
@@ -100,6 +107,12 @@ def dipole(self, geom):
 
     def dipole_deriv(self, geom, deriv_vec):
         return self.dipole_deriv_p.bind(geom, deriv_vec)
+    
+    def quadrupole(self, geom):  # public entry point: binds geom to the "quadrupole" primitive
+        return self.quadrupole_p.bind(geom)
+
+    def quadrupole_deriv(self, geom, deriv_vec):  # binds geom and deriv_vec to the "quadrupole_deriv" primitive
+        return self.quadrupole_deriv_p.bind(geom, deriv_vec)
 
     # Create primitive evaluation rules
     def overlap_impl(self, geom):
@@ -123,6 +136,20 @@ def dipole_impl(self, geom):
         Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
         Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
         return jnp.stack([Mu_X, Mu_Y, Mu_Z])
+    
+    def quadrupole_impl(self, geom):  # evaluation rule registered for quadrupole_p; geom is not read in this body
+        Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY,\
+            Th_XZ, Th_YY, Th_YZ, Th_ZZ = libint_interface.compute_quadrupole_ints()
+        Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)  # every returned array is reshaped to (nbf1, nbf2)
+        Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
+        Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
+        Th_XX = Th_XX.reshape(self.nbf1, self.nbf2)
+        Th_XY = Th_XY.reshape(self.nbf1, self.nbf2)
+        Th_XZ = Th_XZ.reshape(self.nbf1, self.nbf2)
+        Th_YY = Th_YY.reshape(self.nbf1, self.nbf2)
+        Th_YZ = Th_YZ.reshape(self.nbf1, self.nbf2)
+        Th_ZZ = Th_ZZ.reshape(self.nbf1, self.nbf2)
+        return jnp.stack([Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY, Th_XZ, Th_YY, Th_YZ, Th_ZZ])  # result has leading axis of length 9
 
     def overlap_deriv_impl(self, geom, deriv_vec):
         deriv_vec = np.asarray(deriv_vec, int)
@@ -226,7 +253,7 @@ def dipole_deriv_impl(self, geom, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         if self.mode == 'dipole':
-            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(deriv_vec)
+            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(deriv_vec)  # the quadrupole routine returns nine arrays and cannot be unpacked into three
             Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
             Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
             Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
@@ -266,6 +293,101 @@ def dipole_deriv_impl(self, geom, deriv_vec):
                     raise Exception("Something went wrong reading integral derivative file")
             return jnp.stack([Mu_X, Mu_Y, Mu_Z])
 
+    def quadrupole_deriv_impl(self, geom, deriv_vec):  # evaluation rule registered for quadrupole_deriv_p
+        deriv_vec = np.asarray(deriv_vec, int)
+        deriv_order = np.sum(deriv_vec)  # total order of differentiation requested
+        idx = get_deriv_vec_idx(deriv_vec)  # used below to pick a slice / a per-partial dataset on disk
+
+        if self.mode == 'quadrupole':  # in-memory path: libint computes this one derivative directly
+            Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY,\
+                Th_XZ, Th_YY, Th_YZ, Th_ZZ = libint_interface.compute_quadrupole_derivs(deriv_vec)
+            Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
+            Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
+            Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
+            Th_XX = Th_XX.reshape(self.nbf1, self.nbf2)
+            Th_XY = Th_XY.reshape(self.nbf1, self.nbf2)
+            Th_XZ = Th_XZ.reshape(self.nbf1, self.nbf2)
+            Th_YY = Th_YY.reshape(self.nbf1, self.nbf2)
+            Th_YZ = Th_YZ.reshape(self.nbf1, self.nbf2)
+            Th_ZZ = Th_ZZ.reshape(self.nbf1, self.nbf2)
+            return jnp.stack([Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY, Th_XZ, Th_YY, Th_YZ, Th_ZZ])
+        elif self.mode == 'disk':  # NOTE: any other mode falls through and the method returns None
+            if os.path.exists("quadrupole_derivs.h5"):  # one dataset per component and derivative order
+                file_name = "quadrupole_derivs.h5"
+                dataset1_name = "mu_x_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset2_name = "mu_y_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset3_name = "mu_z_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset4_name = "th_xx_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset5_name = "th_xy_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset6_name = "th_xz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset7_name = "th_yy_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset8_name = "th_yz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+                dataset9_name = "th_zz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order)
+            elif os.path.exists("quadrupole_partials.h5"):  # dataset names additionally carry the partial index idx
+                file_name = "quadrupole_partials.h5"
+                dataset1_name = "mu_x_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset2_name = "mu_y_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset3_name = "mu_z_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset4_name = "th_xx_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset5_name = "th_xy_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset6_name = "th_xz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset7_name = "th_yy_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset8_name = "th_yz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+                dataset9_name = "th_zz_" + str(self.nbf1) + "_" + str(self.nbf2)\
+                                          + "_deriv" + str(deriv_order) + "_" + str(idx)
+            else:
+                raise Exception("Something went wrong reading integral derivative file")
+            with h5py.File(file_name, 'r') as f:
+                mu_x_set = f[dataset1_name]
+                mu_y_set = f[dataset2_name]
+                mu_z_set = f[dataset3_name]
+                th_xx_set = f[dataset4_name]  # each quadrupole component reads its own dataset (4-9)
+                th_xy_set = f[dataset5_name]
+                th_xz_set = f[dataset6_name]
+                th_yy_set = f[dataset7_name]
+                th_yz_set = f[dataset8_name]
+                th_zz_set = f[dataset9_name]
+                if len(mu_x_set.shape) == 3:  # rank-3 dataset: select slice idx along the last axis
+                    Mu_X = mu_x_set[:,:,idx]
+                    Mu_Y = mu_y_set[:,:,idx]
+                    Mu_Z = mu_z_set[:,:,idx]
+                    Th_XX = th_xx_set[:,:,idx]
+                    Th_XY = th_xy_set[:,:,idx]
+                    Th_XZ = th_xz_set[:,:,idx]
+                    Th_YY = th_yy_set[:,:,idx]
+                    Th_YZ = th_yz_set[:,:,idx]
+                    Th_ZZ = th_zz_set[:,:,idx]
+                elif len(mu_x_set.shape) == 2:  # rank-2 dataset: read it whole
+                    Mu_X = mu_x_set[:,:]
+                    Mu_Y = mu_y_set[:,:]
+                    Mu_Z = mu_z_set[:,:]
+                    Th_XX = th_xx_set[:,:]
+                    Th_XY = th_xy_set[:,:]
+                    Th_XZ = th_xz_set[:,:]
+                    Th_YY = th_yy_set[:,:]
+                    Th_YZ = th_yz_set[:,:]
+                    Th_ZZ = th_zz_set[:,:]
+                else:
+                    raise Exception("Something went wrong reading integral derivative file")
+            return jnp.stack([Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY, Th_XZ, Th_YY, Th_YZ, Th_ZZ])
+
     def overlap_jvp(self, primals, tangents):
         geom, = primals
         primals_out = self.overlap(geom)
@@ -314,6 +436,18 @@ def dipole_deriv_jvp(self, primals, tangents):
         tangents_out = self.dipole_deriv(geom, deriv_vec + tangents[0])
         return primals_out, tangents_out
 
+    def quadrupole_jvp(self, primals, tangents):  # JVP rule registered for quadrupole_p
+        geom, = primals  # the primitive has a single primal input
+        primals_out = self.quadrupole(geom)
+        tangents_out = self.quadrupole_deriv(geom, tangents[0])  # the tangent is forwarded as the derivative vector
+        return primals_out, tangents_out
+
+    def quadrupole_deriv_jvp(self, primals, tangents):  # JVP rule registered for quadrupole_deriv_p
+        geom, deriv_vec = primals
+        primals_out = self.quadrupole_deriv(geom, deriv_vec)
+        tangents_out = self.quadrupole_deriv(geom, deriv_vec + tangents[0])  # tangent is added onto the current deriv_vec
+        return primals_out, tangents_out
+
     # Define Batching rules, this is only needed since jax.jacfwd will call vmap on the JVP's
     # of each oei function
     # When the input argument of deriv_batch is batched along the 0'th axis
@@ -365,3 +499,22 @@ def dipole_deriv_batch(self, batched_args, batch_dims):
         results = jnp.concatenate(results, axis=0)
         return results, 0
 
+    def quadrupole_deriv_batch(self, batched_args, batch_dims):  # batching rule registered for quadrupole_deriv_p
+        geom_batch, deriv_batch = batched_args
+        geom_dim, deriv_dim = batch_dims  # unpacked but not used below
+        results = []
+        for i in deriv_batch:  # one quadrupole_deriv call per entry of deriv_batch; geom_batch is passed unchanged
+            tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9 = self.quadrupole_deriv(geom_batch, i)
+            mu_x = jnp.expand_dims(tmp1, axis=0)  # give every component a leading axis of length 1
+            mu_y = jnp.expand_dims(tmp2, axis=0)
+            mu_z = jnp.expand_dims(tmp3, axis=0)
+            th_xx = jnp.expand_dims(tmp4, axis=0)
+            th_xy = jnp.expand_dims(tmp5, axis=0)
+            th_xz = jnp.expand_dims(tmp6, axis=0)
+            th_yy = jnp.expand_dims(tmp7, axis=0)
+            th_yz = jnp.expand_dims(tmp8, axis=0)
+            th_zz = jnp.expand_dims(tmp9, axis=0)
+            results.append(jnp.stack([mu_x, mu_y, mu_z, th_xx, th_xy, th_xz, th_yy, th_yz, th_zz], axis=1))  # components stacked on axis 1
+        results = jnp.concatenate(results, axis=0)  # per-entry results joined along axis 0
+        return results, 0  # 0 is reported as the batch axis of the output
+
diff --git a/quax/integrals/utils.h b/quax/integrals/utils.h
new file mode 100644
index 0000000..7d4d631
--- /dev/null
+++ b/quax/integrals/utils.h
@@ -0,0 +1,173 @@
+// Utility functions for libint_interface
+
+// Creates atom objects from xyz file path
+std::vector<libint2::Atom> get_atoms(std::string xyzfilename) // xyzfilename: path handed to std::ifstream
+{
+    std::ifstream input_file(xyzfilename); // NOTE: the stream is not checked for a successful open
+    std::vector<libint2::Atom> atoms = libint2::read_dotxyz(input_file);
+    return atoms;
+}
+
+// Creates a combined basis set
+libint2::BasisSet make_ao_cabs(std::vector<libint2::Atom> atoms, 
+                               std::string obs_name, libint2::BasisSet cabs)
+{
+    // Create OBS; the last 5 characters of obs_name are dropped first (presumably a "-cabs" suffix -- confirm with callers)
+    obs_name.erase(obs_name.end() - 5, obs_name.end());
+    auto obs = libint2::BasisSet(obs_name, atoms);
+    obs.set_pure(false); // use cartesian gaussians
+
+    auto obs_idx = obs.atom2shell(atoms);
+    auto cabs_idx = cabs.atom2shell(atoms);
+
+    std::vector<std::vector<libint2::Shell>> el_bases(37); // indexed by atomic number: Z = 0..36 (up to Kr) needs 37 slots
+    for (size_t i = 0; i < atoms.size(); i++) {
+        if (el_bases[atoms[i].atomic_number].empty()) { // build each element's shell list only once
+            std::vector<libint2::Shell> tmp;
+
+            for(long int& idx : obs_idx[i]) {
+                tmp.push_back(obs[idx]);
+            }
+            for(long int& idx : cabs_idx[i]) {
+                tmp.push_back(cabs[idx]);
+            }
+
+            stable_sort(tmp.begin(), tmp.end(), [](const auto& a, const auto& b) -> bool
+            {
+                return a.contr[0].l < b.contr[0].l; // order by l of the first contraction; stable keeps OBS shells before CABS shells
+            });
+
+            el_bases[atoms[i].atomic_number] = tmp;
+        }
+    }
+
+    // Create CABS, union of orbital and auxiliary basis AOs
+    cabs = libint2::BasisSet(atoms, el_bases);
+    cabs.set_pure(false);
+    return cabs;
+}
+
+// Used to make contracted Gaussian-type geminal for F12 methods
+std::vector<std::pair<double, double>> make_cgtg(double exponent) { // returns (scaled exponent, coefficient) pairs
+    // The fitting coefficients and the exponents from MPQC
+    std::vector<std::pair<double, double>> exp_coeff = {};
+    std::vector<double> coeffs = {-0.31442480597241274, -0.30369575353387201, -0.16806968430232927,
+                                  -0.098115812152857612, -0.060246640234342785, -0.037263541968504843};
+    std::vector<double> exps = {0.22085085450735284, 1.0040191632019282, 3.6212173098378728,
+                                12.162483236221904, 45.855332448029337, 254.23460688554644};
+
+    for (int i = 0; i < exps.size(); i++){
+        auto exp_scaled = (exponent * exponent) * exps[i]; // each tabulated exponent is scaled by exponent^2
+        exp_coeff.push_back(std::make_pair(exp_scaled, coeffs[i])); // coefficients are used unscaled
+    }
+    
+    return exp_coeff;
+}
+
+// Returns square of cgtg
+std::vector<std::pair<double, double>> take_square(std::vector<std::pair<double, double>> input) { // input: (exponent, coefficient) pairs
+    auto n = input.size();
+    std::vector<std::pair<double, double>> output;
+    for (int i = 0; i < n; ++i) {
+        auto e_i = input[i].first;
+        auto c_i = input[i].second;
+        for (int j = i; j < n; ++j) { // only pairs with j >= i are visited
+            auto e_j = input[j].first;
+            auto c_j = input[j].second;
+            double scale = i == j ? 1.0 : 2.0; // off-diagonal pairs count twice, covering both (i,j) and (j,i)
+            output.emplace_back(std::make_pair(e_i + e_j, scale * c_i * c_j)); // exponents add, coefficients multiply
+        }
+    }
+    return output;
+}
+
+// Cartesian product of arbitrary number of vectors, given a vector of vectors
+// Used to find all possible combinations of indices which correspond to desired nuclear derivatives
+// For example, if molecule has two atoms, A and B, and we want nuclear derivative d^2/dAz dBz,
+// represented by deriv_vec = [0,0,1,0,0,1], and we are looping over 4 shells in ERI's,
+// and the four shells are atoms (0,0,1,1), then possible indices 
+// of the 0-11 shell cartesian component indices are {2,5} for d/dAz and {8,11} for d/dBz.
+// So the vector passed to cartesian_product is {{2,5},{8,11}}, and all combinations of elements
+// from first and second subvectors are produced, and the total nuclear derivative of the shell
+// is obtained by summing all of these pieces together.
+// These resulting indices are converted to flattened Libint buffer indices using the generate_*_lookup functions,
+// explained below.
+std::vector<std::vector<int>> cartesian_product (const std::vector<std::vector<int>>& v) {
+    std::vector<std::vector<int>> s = {{}}; // start from one empty combination
+    for (const auto& u : v) {
+        std::vector<std::vector<int>> r;
+        for (const auto& x : s) {
+            for (const auto y : u) {
+                r.push_back(x); // copy the partial combination ...
+                r.back().push_back(y); // ... and extend it with one element of the current subvector
+            }
+        }
+        s = std::move(r); // an empty subvector u leaves r empty, so the final result is empty
+    }
+    return s;
+}
+
+// Converts a derivative vector (3*Natom array of integers defining which coordinates to 
+// differentiate wrt and how many times) to a set of atom indices and coordinate indices 0,1,2->x,y,z
+void process_deriv_vec(std::vector<int> deriv_vec, 
+                       std::vector<int> *desired_atoms, 
+                       std::vector<int> *desired_coordinates) // both outputs are appended to, never cleared
+{
+    for (int i = 0; i < deriv_vec.size(); i++) {
+        if (deriv_vec[i] > 0) {
+            for (int j = 0; j < deriv_vec[i]; j++) { // one entry per unit of derivative order in slot i
+                desired_atoms->push_back(i / 3); // atom index
+                desired_coordinates->push_back(i % 3); // coordinate index 0,1,2 -> x,y,z
+            }
+        }
+    }
+}
+
+// Returns total size of the libint integral derivative buffer, which is how many unique nth order derivatives
+// wrt k objects which have 3 differentiable coordinates each
+// k: how many centers
+// n: order of differentiation
+// l: how many atoms (needed for potential integrals only!)
+int how_many_derivs(int k, int n, int l = 0) {
+    int val = 1;
+    int factorial = 1;
+    for (int i=0; i < n; i++) {
+        val *= (3 * (k + l) + i); // rising product over m = 3*(k+l): m * (m+1) * ... * (m+n-1)
+        factorial *= i + 1; // accumulates n!
+    }
+    val /= factorial; // rising product / n!; plain int arithmetic, no overflow guard
+    return val;
+}
+
+void cwr_recursion(std::vector<int> inp, // NOTE: passed by value, so it is copied at every recursive call
+                   std::vector<int> &out,
+                   std::vector<std::vector<int>> &result,
+                   int k, int i, int n)
+{
+    // base case: if combination size is k, add to result 
+    if (out.size() == k){
+        result.push_back(out);
+        return;
+    }
+    for (int j = i; j < n; j++){
+        out.push_back(inp[j]);
+        cwr_recursion(inp, out, result, k, j, n); // recursing with j (not j + 1) allows the same element to repeat
+        // backtrack - remove current element from solution
+        out.pop_back();
+    }
+}
+
+std::vector<std::vector<int>> generate_multi_index_lookup(int nparams, int deriv_order) {
+    using namespace std;
+    // Generate vector of indices 0 through nparams-1
+    vector<int> inp;
+    for (int i = 0; i < nparams; i++) {
+        inp.push_back(i);
+    }
+    // Generate all possible combinations with repetition.
+    // These are upper triangle indices, and the length of them is the total number of derivatives
+    vector<int> out;
+    vector<vector<int>> combos;
+    cwr_recursion(inp, out, combos, deriv_order, 0, nparams); // fills combos with index tuples of length deriv_order
+    return combos;
+}
\ No newline at end of file
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 90d2d8e..9481b1c 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -2,12 +2,14 @@
 jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 
-from .ints import compute_integrals, compute_dipole_ints
+from .ints import compute_integrals, compute_dipole_ints, compute_quadrupole_ints
 from .energy_utils import nuclear_repulsion, cholesky_orthogonalization
 
 def restricted_hartree_fock(*args, options, deriv_order=0, return_aux_data=False):
-    if options['electric_field']:
-        electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
+    if options['electric_field'] == 1:
+        efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
+    elif options['electric_field'] == 2:
+        efield_grad, efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
     else:
         geom, basis_set, nelectrons, nuclear_charges, xyz_path = args
 
@@ -45,9 +47,13 @@ def form_shift():
     H = T + V
     Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
 
-    if options['electric_field']:
+    if options['electric_field'] == 1:
         Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options)
-        H += jnp.einsum('x,xij->ij', electric_field, Mu_XYZ)
+        H += jnp.einsum('x,xij->ij', efield, Mu_XYZ)
+    elif options['electric_field'] == 2:
+        Mu_Th = compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options)
+        H += jnp.einsum('x,xij->ij', efield, Mu_Th[:3, :, :])
+        H += jnp.einsum('x,xij->ij', efield_grad[jnp.triu_indices(3)], Mu_Th[3:, :, :])
     
     def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
@@ -94,8 +100,10 @@ def scf_procedure(carry):
                                                               # (iter, dE, dRMS, eps, C, D_old, D, E_scf)
     print(iteration, " RHF iterations performed")
 
-    if options['electric_field']:
-        E_scf += jnp.einsum('x,q,qx', electric_field, nuclear_charges, geom.reshape(-1,3))
+    if options['electric_field'] > 0:
+        E_scf += jnp.einsum('x,q,qx->', efield, nuclear_charges, geom.reshape(-1,3))
+    if options['electric_field'] > 1:
+        E_scf += jnp.einsum('ab,q,qa,qb->', jnp.triu(efield_grad), nuclear_charges, geom.reshape(-1,3), geom.reshape(-1,3))
 
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 
     tmp = jnp.round(eps, 6)
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index 43c31a4..b1be65c 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -82,6 +82,32 @@ def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
     libint_interface.finalize()
     return Mu_
 
+def compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options):
+    """Return OEI.quadrupole(geom) for basis_set, computed in memory or via disk per options['integral_algo']."""
+    algo = options['integral_algo']
+    basis_name = basis_set.name()
+    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
+
+    if algo == 'libint_disk':
+        # Check disk for currently existing integral derivatives
+        check_multipole = check_multipole_disk('quadrupole', basis_set, basis_set, deriv_order)
+
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        # If disk integral derivs are right, nothing to do
+        if check_multipole:
+            Mu_Th = oei_obj.quadrupole(geom)
+        else:
+            libint_interface.compute_quadrupole_deriv_disk(deriv_order)  # NOTE(review): confirm libint_interface exports this function
+            Mu_Th = oei_obj.quadrupole(geom)
+    else:
+        # In-memory path: the mode must be 'quadrupole', the value quadrupole_deriv_impl checks for
+        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'quadrupole')
+        # Compute integrals
+        Mu_Th = oei_obj.quadrupole(geom)
+
+    libint_interface.finalize()
+    return Mu_Th
+
 def compute_f12_oeints(geom, basis1, basis2, xyz_path, deriv_order, options, cabs):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
@@ -259,18 +285,28 @@ def check_multipole_disk(int_type, basis1, basis2, deriv_order, address=None):
     correct_int_derivs = False
     correct_nbf1 = correct_nbf2 = correct_deriv_order = False
 
-    if ((os.path.exists("dipole_derivs.h5"))):
+    if ((os.path.exists(int_type, "_derivs.h5"))):
         print("Found currently existing multipole integral derivatives in your working directory. Trying to use them.")
-        oeifile = h5py.File('dipole_derivs.h5', 'r')
+        oeifile = h5py.File(int_type + '_derivs.h5', 'r')
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
 
         if int_type == "dipole":
-            oei_name = ["mu_x_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
-                        "mu_y_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),\
+            oei_name = ["mu_x_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "mu_y_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
                         "mu_z_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order)]
+        elif int_type == "quadrupole":
+            oei_name = ["mu_x_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "mu_y_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "mu_z_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_xx_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_xy_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_xz_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_yy_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_yz_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order),
+                        "th_zz_" + str(nbf1) + "_" + str(nbf2) + "_deriv" + str(deriv_order)]
         else:
-            raise Exception("Only dipole integrals currently.")
+            raise Exception("Integral type not recognized.")
 
         for name in list(oeifile.keys()):
             if name in oei_name:

From 90381a8688e13bd326aeb08166051601eee4534a Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 8 Apr 2024 12:58:39 -0400
Subject: [PATCH 66/91] Quadrupole for all methods

---
 quax/methods/ccsd.py         |  9 ++++++---
 quax/methods/ccsd_t.py       | 31 +++++++++++++++++--------------
 quax/methods/hartree_fock.py | 15 ++++++++-------
 quax/methods/mp2.py          |  9 ++++++---
 quax/methods/mp2f12.py       |  9 ++++++---
 5 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 428a8c8..0f5dd95 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -6,9 +6,12 @@
 from .hartree_fock import restricted_hartree_fock
 
 def rccsd(*args, options, deriv_order=0, return_aux_data=False):
-    if options['electric_field']:
-        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    if options['electric_field'] == 1:
+        efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    elif options['electric_field'] == 2:
+        efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = efield_grad, efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path
     else:
         geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
         scf_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index 5cf5c51..bc27957 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -94,18 +94,21 @@ def loop_k(arr2):
     return pT
 
 def rccsd_t(*args, options, deriv_order=0):
-    if options['electric_field']:
-        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        ccsd_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
-    else:
-        geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        ccsd_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
-
-    E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(*ccsd_args, options=options, deriv_order=deriv_order, return_aux_data=True)
-
-    print("Running (T) Correction...")
-    pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
-    #print("(T) energy correction:     ", pT)
-    #print("CCSD(T) total energy:      ", E_ccsd + pT)
-    return E_ccsd + pT
+   if options['electric_field'] == 1:  # args lead with efield only
+       efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+       ccsd_args = efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+   elif options['electric_field'] == 2:  # args lead with efield_grad, then efield
+       efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+       ccsd_args = efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+   else:
+       geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+       ccsd_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
+
+   E_ccsd, T1, T2, V, fock_Od, fock_Vd = rccsd(*ccsd_args, options=options, deriv_order=deriv_order, return_aux_data=True)
+
+   print("Running (T) Correction...")
+   pT = perturbative_triples(T1, T2, V, fock_Od, fock_Vd)
+   #print("(T) energy correction:     ", pT)
+   #print("CCSD(T) total energy:      ", E_ccsd + pT)
+   return E_ccsd + pT
 
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 9481b1c..4844f9a 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -49,11 +49,11 @@ def form_shift():
 
     if options['electric_field'] == 1:
         Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options)
-        H += jnp.einsum('x,xij->ij', efield, Mu_XYZ)
+        H += jnp.einsum('x,xij->ij', efield, Mu_XYZ, optimize = 'optimal')
     elif options['electric_field'] == 2:
         Mu_Th = compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options)
-        H += jnp.einsum('x,xij->ij', efield, Mu_Th[:3, :, :])
-        H += jnp.einsum('x,xij->ij', efield_grad[jnp.triu_indices(3)], Mu_Th[3:, :, :])
+        H += jnp.einsum('x,xij->ij', efield, Mu_Th[:3, :, :], optimize = 'optimal')
+        H += jnp.einsum('x,xij->ij', efield_grad[jnp.triu_indices(3)], Mu_Th[3:, :, :], optimize = 'optimal')
     
     def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
@@ -65,7 +65,7 @@ def rhf_iter(F, D):
         D = Cocc @ Cocc.T
         return E_scf, D, C, eps
 
-    def DIIS(F, D, S):
+    def DIIS_Err(F, D, S):
         diis_e = jnp.einsum('ij,jk,kl->il', F, D, S) - jnp.einsum('ij,jk,kl->il', S, D, F)
         diis_e = A @ diis_e @ A
         return jnp.mean(diis_e ** 2) ** 0.5
@@ -83,7 +83,7 @@ def scf_procedure(carry):
         # Compute energy, transform Fock and diagonalize, get new density
         e_scf, D_, C_, eps_ = rhf_iter(F, D_)
 
-        de_, drms_ = jax.lax.cond(iter + 1 == maxit, lambda: (1.e-15, 1.e-15), lambda: (e_old - e_scf, DIIS(F, D_, S)))
+        de_, drms_ = jax.lax.cond(iter + 1 == maxit, lambda: (1.e-15, 1.e-15), lambda: (e_old - e_scf, DIIS_Err(F, D_, S)))
 
         return (iter + 1, de_, drms_, eps_, C_, D_old, D_, e_scf)
 
@@ -101,9 +101,10 @@ def scf_procedure(carry):
     print(iteration, " RHF iterations performed")
 
     if options['electric_field'] > 0:
-        E_scf += jnp.einsum('x,q,qx->', efield, nuclear_charges, geom.reshape(-1,3))
+        E_scf += jnp.einsum('x,q,qx->', efield, nuclear_charges, geom.reshape(-1,3), optimize = 'optimal')
     if options['electric_field'] > 1:
-        E_scf += jnp.einsum('ab,q,qa,qb->', jnp.triu(efield_grad), nuclear_charges, geom.reshape(-1,3), geom.reshape(-1,3))
+        E_scf += jnp.einsum('ab,q,qa,qb->', jnp.triu(efield_grad), nuclear_charges,
+                            geom.reshape(-1,3), geom.reshape(-1,3), optimize = 'optimal')
 
     # If many orbitals are degenerate, warn that higher order derivatives may be unstable 
     tmp = jnp.round(eps, 6)
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index 9bf617b..fcd8401 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -7,9 +7,12 @@
 from .hartree_fock import restricted_hartree_fock
 
 def restricted_mp2(*args, options, deriv_order=0, return_aux_data=False):
-    if options['electric_field']:
-        electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        scf_args = electric_field, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    if options['electric_field'] == 1:
+        efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path
+    elif options['electric_field'] == 2:
+        efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        scf_args = efield_grad, efield, geom, basis_set, nelectrons, nuclear_charges, xyz_path
     else:
         geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
         scf_args = (geom, basis_set, nelectrons, nuclear_charges, xyz_path)
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 1cd4e45..1e479ec 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -9,9 +9,12 @@
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(*args, options, deriv_order=0):
-    if options['electric_field']:
-        electric_field, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
-        mp2_args = electric_field, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+    if options['electric_field'] == 1:
+        efield, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        mp2_args = efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
+    elif options['electric_field'] == 2:
+        efield_grad, efield, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        mp2_args = efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
     else:
         geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
         mp2_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)

From 72cbdf8a1034bf1f2ea125e457cd5a8f074eed28 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 8 Apr 2024 13:19:41 -0400
Subject: [PATCH 67/91] Minor fix

---
 quax/core.py          | 2 +-
 quax/integrals/oei.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 2f76d57..33a6aac 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -526,4 +526,4 @@ def mixed_deriv(molecule, basis_name, method, efield=None, efield_grad=None,
         print("Desired electronic structure method not understood. Use 'scf' 'hf' 'mp2' 'ccsd' or 'ccsd(t)' ")
 
     return compute_mixed(method, args, deriv_order_F=deriv_order_F, deriv_order_R=deriv_order_R, 
-                         partial_F=partial_F, partial_R=partial_R, options=options)
\ No newline at end of file
+                         partial_F=partial_F, partial_R=partial_R, options=options)
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index 280c1b6..c92782f 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -253,7 +253,7 @@ def dipole_deriv_impl(self, geom, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         if self.mode == 'dipole':
-            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_quadrupole_derivs(deriv_vec)
+            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(deriv_vec)
             Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
             Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
             Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)

From d77e957ebec2121243e1794f132a49329519d280 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 15 Apr 2024 11:27:36 -0400
Subject: [PATCH 68/91] Better if/else for perms

---
 quax/integrals/libint_interface.cc | 68 +++++++++++-------------------
 1 file changed, 24 insertions(+), 44 deletions(-)

diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 4f5c607..713df91 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -568,7 +568,6 @@ py::array compute_2e_int(std::string type, double beta) {
             if (ints_shellset == nullptr)
                 continue;  // nullptr returned if the entire shell-set was screened out
 
-            auto full = false;
             if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
@@ -584,14 +583,14 @@ py::array compute_2e_int(std::string type, double beta) {
                                 size_t offset_4 = bf4 + f4;
                                 size_t offset_4_T = (bf4 + f4) * nbf3;
                                 result[offset_1 + offset_2 + offset_3 + offset_4] = 
-                                    result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] = ints_shellset[idx];
+                                    result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] =
+                                        result[offset_1_T + offset_2_T + offset_3 + offset_4] =
+                                            result[offset_1 + offset_2 + offset_3_T + offset_4_T] = ints_shellset[idx];
                             }
                         }
                     }
                 }
-                full = true;
-            } 
-            if (bs1_equiv_bs2 && p1 != p2) {
+            } else if (bs1_equiv_bs2 && p1 != p2) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -609,9 +608,7 @@ py::array compute_2e_int(std::string type, double beta) {
                         }
                     }
                 }
-                full = true;
-            } 
-            if (bs3_equiv_bs4 && p3 != p4) {
+            } else if (bs3_equiv_bs4 && p3 != p4) {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -629,9 +626,7 @@ py::array compute_2e_int(std::string type, double beta) {
                         }
                     }
                 }
-                full = true;
-            } 
-            if (full == false) {
+            } else {
                 // Loop over shell block, keeping a total count idx for the size of shell set
                 for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                     size_t offset_1 = (bf1 + f1) * nbf2 * nbf3 * nbf4;
@@ -1254,7 +1249,6 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
             engines[thread_id].compute(s1, s2, s3, s4); // Compute shell set
             const auto& buf_vec = engines[thread_id].results(); // will point to computed shell sets
             
-            auto full = false;
             if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
                 for(auto i = 0; i < buffer_indices.size(); ++i) {
                     auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -1272,15 +1266,15 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
                                     size_t offset_4 = bf4 + f4;
                                     size_t offset_4_T = (bf4 + f4) * nbf3;
                                     result[offset_1 + offset_2 + offset_3 + offset_4] = 
-                                        result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] += ints_shellset[idx];
+                                      result[offset_1_T + offset_2_T + offset_3_T + offset_4_T] =
+                                        result[offset_1_T + offset_2_T + offset_3 + offset_4] =
+                                          result[offset_1 + offset_2 + offset_3_T + offset_4_T] += ints_shellset[idx];
                                 }
                             }
                         }
                     }
                 }
-                full = true;
-            }
-            if (bs1_equiv_bs2 && p1 != p2) {
+            } else if (bs1_equiv_bs2 && p1 != p2) {
                 for(auto i = 0; i < buffer_indices.size(); ++i) {
                     auto ints_shellset = buf_vec[buffer_indices[i]];
                     if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -1301,9 +1295,7 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
                         }
                     }
                 }
-                full = true;
-            }
-            if (bs3_equiv_bs4 && p3 != p4) {
+            } else if (bs3_equiv_bs4 && p3 != p4) {
                 for(auto i = 0; i < buffer_indices.size(); ++i) {
                     auto ints_shellset = buf_vec[buffer_indices[i]];
                     if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -1325,9 +1317,7 @@ py::array compute_2e_deriv(std::string type, double beta, std::vector<int> deriv
                         }
                     }
                 }
-                full = true;
-            }
-            if (full == false) {
+            } else {
                 for(auto i = 0; i < buffer_indices.size(); ++i) {
                     auto ints_shellset = buf_vec[buffer_indices[i]];
                     if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -1995,7 +1985,6 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                         buffer_indices.push_back(buf_idx);
                     }
 
-                    auto full = false;
                     // Loop over shell block, keeping a total count idx for the size of shell set
                     if (bs1_equiv_bs2 && p1 != p2 && bs3_equiv_bs4 && p3 != p4) {
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
@@ -2006,15 +1995,15 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                                     for(auto f3 = 0; f3 != n3; ++f3) {
                                         for(auto f4 = 0; f4 != n4; ++f4, ++idx) {
                                             ints_shellset_slab_1234[f1][f2][f3][f4][nuc_idx] =
-                                                ints_shellset_slab_2143[f2][f1][f4][f3][nuc_idx] += ints_shellset[idx];
+                                              ints_shellset_slab_2143[f2][f1][f4][f3][nuc_idx] =
+                                                ints_shellset_slab_2134[f2][f1][f3][f4][nuc_idx] =
+                                                  ints_shellset_slab_1243[f1][f2][f4][f3][nuc_idx] += ints_shellset[idx];
                                         }
                                     }
                                 }
                             }
                         }
-                        full = true;
-                    }
-                    if (bs1_equiv_bs2 && p1 != p2) {
+                    } else if (bs1_equiv_bs2 && p1 != p2) {
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
                             auto ints_shellset = buf_vec[buffer_indices[i]];
                             if (ints_shellset == nullptr) continue;
@@ -2029,9 +2018,7 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                                 }
                             }
                         }
-                        full = true;
-                    }
-                    if (bs3_equiv_bs4 && p3 != p4) {
+                    } else if (bs3_equiv_bs4 && p3 != p4) {
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
                             auto ints_shellset = buf_vec[buffer_indices[i]];
                             if (ints_shellset == nullptr) continue;
@@ -2046,9 +2033,7 @@ void compute_2e_deriv_disk(std::string type, double beta, int max_deriv_order) {
                                 }
                             }
                         }
-                        full = true;
-                    }
-                    if (full == false) {
+                    } else {
                         for(auto i = 0; i < buffer_indices.size(); ++i) {
                             auto ints_shellset = buf_vec[buffer_indices[i]];
                             if (ints_shellset == nullptr) continue;
@@ -2709,7 +2694,6 @@ py::array eri_deriv_core(int deriv_order) {
                     buffer_indices.push_back(buf_idx);
                 }
 
-                auto full = false;
                 if (p1 != p2 && p3 != p4) {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
@@ -2727,15 +2711,15 @@ py::array eri_deriv_core(int deriv_order) {
                                         size_t offset_4 = bf4 + f4;
                                         size_t offset_4_T = (bf4 + f4) * nbf3;
                                         result[offset_1 + offset_2 + offset_3 + offset_4 + offset_nuc_idx] = 
-                                            result[offset_1_T + offset_2_T + offset_3_T + offset_4_T  + offset_nuc_idx] += ints_shellset[idx];
+                                            result[offset_1_T + offset_2_T + offset_3_T + offset_4_T  + offset_nuc_idx] =
+                                            result[offset_1_T + offset_2_T + offset_3 + offset_4  + offset_nuc_idx] =
+                                            result[offset_1 + offset_2 + offset_3_T + offset_4_T  + offset_nuc_idx] += ints_shellset[idx];
                                     }
                                 }
                             }
                         }
                     }
-                    full = true;
-                }
-                if (p1 != p2) {
+                } else if (p1 != p2) {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -2756,9 +2740,7 @@ py::array eri_deriv_core(int deriv_order) {
                             }
                         }
                     }
-                    full = true;
-                }
-                if (p3 != p4) {
+                } else if (p3 != p4) {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out
@@ -2780,9 +2762,7 @@ py::array eri_deriv_core(int deriv_order) {
                             }
                         }
                     }
-                    full = true;
-                }
-                if (full == false) {
+                } else {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto ints_shellset = buf_vec[buffer_indices[i]];
                         if (ints_shellset == nullptr) continue;  // nullptr returned if shell-set screened out

From 5454830f4ed0a62987c00a365370384de1c91d69 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 15 Apr 2024 12:18:15 -0400
Subject: [PATCH 69/91] Perturb MP2-F12 Fock

---
 quax/methods/hartree_fock.py |  4 ++--
 quax/methods/ints.py         | 28 +++++++++++++++-------------
 quax/methods/mp2f12.py       | 30 +++++++++++++++++++++---------
 3 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index 4844f9a..f340d4d 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -48,10 +48,10 @@ def form_shift():
     Enuc = nuclear_repulsion(geom.reshape(-1,3), nuclear_charges)
 
     if options['electric_field'] == 1:
-        Mu_XYZ = compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options)
+        Mu_XYZ = compute_dipole_ints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
         H += jnp.einsum('x,xij->ij', efield, Mu_XYZ, optimize = 'optimal')
     elif options['electric_field'] == 2:
-        Mu_Th = compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options)
+        Mu_Th = compute_quadrupole_ints(geom, basis_set, basis_set, xyz_path, deriv_order, options)
         H += jnp.einsum('x,xij->ij', efield, Mu_Th[:3, :, :], optimize = 'optimal')
         H += jnp.einsum('x,xij->ij', efield_grad[jnp.triu_indices(3)], Mu_Th[3:, :, :], optimize = 'optimal')
     
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index b1be65c..f17abef 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -56,17 +56,18 @@ def compute_integrals(geom, basis_set, xyz_path, deriv_order, options):
     libint_interface.finalize()
     return S, T, V, G
 
-def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
+def compute_dipole_ints(geom, basis1, basis2, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
-    basis_name = basis_set.name()
-    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
+    basis1_name = basis1.name()
+    basis2_name = basis2.name()
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name, options['ints_tolerance'])
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check_multipole = check_multipole_disk('dipole', basis_set, basis_set, deriv_order)
+        check_multipole = check_multipole_disk('dipole', basis1, basis2, deriv_order)
 
-        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
         # If disk integral derivs are right, nothing to do
         if check_multipole:
             Mu_ = oei_obj.dipole(geom)
@@ -75,24 +76,25 @@ def compute_dipole_ints(geom, basis_set, xyz_path, deriv_order, options):
             Mu_ = oei_obj.dipole(geom)
     else:
         # Precompute TEI derivatives
-        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'dipole')
         # Compute integrals
         Mu_ = oei_obj.dipole(geom)
 
     libint_interface.finalize()
     return Mu_
 
-def compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options):
+def compute_quadrupole_ints(geom, basis1, basis2, xyz_path, deriv_order, options):
     # Load integral algo, decides to compute integrals in memory or use disk
     algo = options['integral_algo']
-    basis_name = basis_set.name()
-    libint_interface.initialize(xyz_path, basis_name, basis_name, basis_name, basis_name, options['ints_tolerance'])
+    basis1_name = basis1.name()
+    basis2_name = basis2.name()
+    libint_interface.initialize(xyz_path, basis1_name, basis2_name, basis1_name, basis2_name, options['ints_tolerance'])
 
     if algo == 'libint_disk':
         # Check disk for currently existing integral derivatives
-        check_multipole = check_multipole_disk('quadrupole', basis_set, basis_set, deriv_order)
+        check_multipole = check_multipole_disk('quadrupole', basis1, basis2, deriv_order)
 
-        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'disk')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'disk')
         # If disk integral derivs are right, nothing to do
         if check_multipole:
             Mu_Th = oei_obj.quadrupole(geom)
@@ -101,7 +103,7 @@ def compute_quadrupole_ints(geom, basis_set, xyz_path, deriv_order, options):
             Mu_Th = oei_obj.quadrupole(geom)
     else:
         # Precompute TEI derivatives
-        oei_obj = OEI(basis_set, basis_set, xyz_path, deriv_order, 'dipole')
+        oei_obj = OEI(basis1, basis2, xyz_path, deriv_order, 'dipole')
         # Compute integrals
         Mu_Th = oei_obj.quadrupole(geom)
 
@@ -285,7 +287,7 @@ def check_multipole_disk(int_type, basis1, basis2, deriv_order, address=None):
     correct_int_derivs = False
     correct_nbf1 = correct_nbf2 = correct_deriv_order = False
 
-    if ((os.path.exists(int_type, "_derivs.h5"))):
+    if ((os.path.exists(int_type + "_derivs.h5"))):
         print("Found currently existing multipole integral derivatives in your working directory. Trying to use them.")
         oeifile = h5py.File(int_type + '_derivs.h5', 'r')
         nbf1 = basis1.nbf()
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 1e479ec..90e0c78 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -4,19 +4,22 @@
 from jax.lax import fori_loop, cond
 
 from .basis_utils import build_CABS
-from .ints import compute_f12_oeints, compute_f12_teints
+from .ints import compute_f12_oeints, compute_f12_teints, compute_dipole_ints, compute_quadrupole_ints
 from .energy_utils import partial_tei_transformation, cartesian_product
 from .mp2 import restricted_mp2
 
 def restricted_mp2_f12(*args, options, deriv_order=0):
     if options['electric_field'] == 1:
         efield, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        fields = (efield,)
         mp2_args = efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
     elif options['electric_field'] == 2:
         efield_grad, efield, geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        fields = (efield_grad, efield)
         mp2_args = efield_grad, efield, geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path
     else:
         geom, basis_set, cabs_set, nelectrons, nfrzn, nuclear_charges, xyz_path = args
+        fields = None
         mp2_args = (geom, basis_set, nelectrons, nfrzn, nuclear_charges, xyz_path)
 
     E_mp2, C_obs, eps, G = restricted_mp2(*mp2_args, options=options, deriv_order=deriv_order, return_aux_data=True)
@@ -32,7 +35,7 @@ def restricted_mp2_f12(*args, options, deriv_order=0):
     spaces = (ndocc, nobs, C_cabs.shape[0]) # ndocc, nobs, nri
 
     # Fock
-    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    f, fk, k = form_Fock(geom, basis_set, cabs_set, C_mats, spaces, fields, xyz_path, deriv_order, options)
 
     # V Intermediate
     V = form_V(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)\
@@ -119,7 +122,7 @@ def t_(p, q, r, s):
     )
 
 # One-Electron Integrals
-def one_body_mo_computer(geom, bs1, bs2, C1, C2, xyz_path, deriv_order, options):
+def one_body_mo_computer(geom, bs1, bs2, C1, C2, fields, xyz_path, deriv_order, options):
     """
     General one-body MO computer
     that computes the AOs and 
@@ -127,23 +130,32 @@ def one_body_mo_computer(geom, bs1, bs2, C1, C2, xyz_path, deriv_order, options)
     """
     T, V = compute_f12_oeints(geom, bs1, bs2, xyz_path, deriv_order, options, False)
     AO = T + V
+
+    if options['electric_field'] == 1:
+        Mu_XYZ = compute_dipole_ints(geom, bs1, bs2, xyz_path, deriv_order, options)
+        AO += jnp.einsum('x,xij->ij', fields[0], Mu_XYZ, optimize = 'optimal')
+    elif options['electric_field'] == 2:
+        Mu_Th = compute_quadrupole_ints(geom, bs1, bs2, xyz_path, deriv_order, options)
+        AO += jnp.einsum('x,xij->ij', fields[0], Mu_Th[:3, :, :], optimize = 'optimal')
+        AO += jnp.einsum('x,xij->ij', fields[1][jnp.triu_indices(3)], Mu_Th[3:, :, :], optimize = 'optimal')
+
     MO = C1.T @ AO @ C2
     return MO
 
-def form_h(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+def form_h(geom, basis_set, cabs_set, C_mats, spaces, fields, xyz_path, deriv_order, options):
     _, nobs, nri = spaces
     _, C_obs, C_cabs = C_mats
 
     tv = jnp.zeros((nri, nri))
 
-    mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, xyz_path, deriv_order, options)
+    mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, fields, xyz_path, deriv_order, options)
     tv = tv.at[:nobs, :nobs].set(mo1) # <O|O>
 
-    mo2 = one_body_mo_computer(geom, basis_set, cabs_set, C_obs, C_cabs, xyz_path, deriv_order, options)
+    mo2 = one_body_mo_computer(geom, basis_set, cabs_set, C_obs, C_cabs, fields, xyz_path, deriv_order, options)
     tv = tv.at[:nobs, nobs:nri].set(mo2) # <O|C>
     tv = tv.at[nobs:nri, :nobs].set(mo2.T) # <C|O>
 
-    mo3 = one_body_mo_computer(geom, cabs_set, cabs_set, C_cabs, C_cabs, xyz_path, deriv_order, options)
+    mo3 = one_body_mo_computer(geom, cabs_set, cabs_set, C_cabs, C_cabs, fields, xyz_path, deriv_order, options)
     tv = tv.at[nobs:nri, nobs:nri].set(mo3) # <C|C>
 
     return tv
@@ -257,9 +269,9 @@ def form_F2(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, op
     return f12_squared
 
 # Fock
-def form_Fock(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options):
+def form_Fock(geom, basis_set, cabs_set, C_mats, spaces, fields, xyz_path, deriv_order, options):
 
-    fk = form_h(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
+    fk = form_h(geom, basis_set, cabs_set, C_mats, spaces, fields, xyz_path, deriv_order, options)
     J = form_J(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
     K = form_K(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, options)
     

From f7865fcf37543ad7a053601f93f9b29ddc3a80d2 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 20 May 2024 10:52:24 -0400
Subject: [PATCH 70/91] Fix config, frzn_core, COM, and test tols

---
 quax/core.py                       |  5 ++--
 quax/integrals/libint_interface.cc | 44 ++++++++++++------------------
 quax/integrals/makefile            | 14 +++++-----
 quax/integrals/oei.py              | 13 +++++----
 quax/methods/ccsd.py               |  2 +-
 quax/methods/ccsd_t.py             |  2 +-
 quax/methods/energy_utils.py       |  2 +-
 quax/methods/hartree_fock.py       |  4 +--
 quax/methods/ints.py               |  9 ++++--
 quax/methods/mp2.py                |  2 +-
 quax/methods/mp2f12.py             | 35 +++++++++++++++++++-----
 tests/test_dipoles.py              | 14 +++++-----
 tests/test_hessians.py             |  4 +--
 13 files changed, 86 insertions(+), 64 deletions(-)

diff --git a/quax/core.py b/quax/core.py
index 33a6aac..952edf3 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -1,7 +1,6 @@
 import jax 
 from jax import jacfwd
-from jax.config import config
-config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 import psi4
 import numpy as np
@@ -35,6 +34,7 @@ def check_options(options):
     keyword_options = {'maxit': 100,
                        'damping': False,
                        'damp_factor': 0.5,
+                       'guess_core': False,
                        'spectral_shift': True,
                        'integral_algo': 'libint_core',
                        'ints_tolerance': 1.0e-14,
@@ -304,6 +304,7 @@ def energy(molecule, basis_name, method, options=None):
     # Set keyword options
     if options:
         options = check_options(options)
+        options['integral_algo'] = 'libint_core'
     else:
         options = check_options({'integral_algo': 'libint_core'})
     print("Using integral method: {}".format(options['integral_algo']))
diff --git a/quax/integrals/libint_interface.cc b/quax/integrals/libint_interface.cc
index 713df91..54c5d06 100644
--- a/quax/integrals/libint_interface.cc
+++ b/quax/integrals/libint_interface.cc
@@ -299,7 +299,7 @@ py::array compute_1e_int(std::string type) {
 }
 
 // Compute one-electron dipole integrals
-std::vector<py::array> compute_dipole_ints() {
+std::vector<py::array> compute_dipole_ints(std::array<double,3> COM) {
     // Shell pairs after screening
     const auto bs1_equiv_bs2 = (bs1 == bs2);
     auto shellpairs = build_shellpairs(bs1, bs2);
@@ -308,7 +308,7 @@ std::vector<py::array> compute_dipole_ints() {
     std::vector<libint2::Engine> engines(nthreads);
 
     // COM generator
-    std::array<double,3> COM = {0.000, 0.000, 0.000};
+    // std::array<double,3> COM = {0.000, 0.000, 0.000};
 
     // Will compute overlap + electric dipole moments
     engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l);
@@ -353,12 +353,12 @@ std::vector<py::array> compute_dipole_ints() {
         if (bs1_equiv_bs2 && p1 != p2) {
             for(auto f1 = 0, idx = 0; f1 != n1; ++f1) {
                 for(auto f2 = 0; f2 != n2; ++f2, ++idx) {
-                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] = mu_x_shellset[idx];
-                    Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] = mu_x_shellset[idx];
-                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] = mu_y_shellset[idx];
-                    Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] = mu_y_shellset[idx];
-                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] = mu_z_shellset[idx];
-                    Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] = mu_z_shellset[idx];
+                    Mu_X[(bf1 + f1) * nbf2 + bf2 + f2] =
+                        Mu_X[(bf2 + f2) * nbf1 + bf1 + f1] = mu_x_shellset[idx];
+                    Mu_Y[(bf1 + f1) * nbf2 + bf2 + f2] =
+                        Mu_Y[(bf2 + f2) * nbf1 + bf1 + f1] = mu_y_shellset[idx];
+                    Mu_Z[(bf1 + f1) * nbf2 + bf2 + f2] =
+                        Mu_Z[(bf2 + f2) * nbf1 + bf1 + f1] = mu_z_shellset[idx];
                 }
             }
         } else {
@@ -376,7 +376,7 @@ std::vector<py::array> compute_dipole_ints() {
 }
 
 // Compute one-electron dipole and quadrupole integrals
-std::vector<py::array> compute_quadrupole_ints() {
+std::vector<py::array> compute_quadrupole_ints(std::array<double,3> COM) {
     // Shell pairs after screening
     const auto bs1_equiv_bs2 = (bs1 == bs2);
     auto shellpairs = build_shellpairs(bs1, bs2);
@@ -384,9 +384,6 @@ std::vector<py::array> compute_quadrupole_ints() {
     // Integral engine
     std::vector<libint2::Engine> engines(nthreads);
 
-    // COM generator
-    std::array<double,3> COM = {0.000, 0.000, 0.000};
-
     // Will compute overlap + electric dipole moments
     engines[0] = libint2::Engine(libint2::Operator::emultipole2, max_nprim, max_l);
     engines[0].set_params(COM); // with COM as the multipole origin
@@ -789,7 +786,7 @@ py::array compute_1e_deriv(std::string type, std::vector<int> deriv_vec) {
 }
 
 // Computes nuclear derivatives of dipole integrals
-std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
+std::vector<py::array> compute_dipole_derivs(std::array<double,3> COM, std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
@@ -810,9 +807,6 @@ std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
     // Integral engine
     std::vector<libint2::Engine> engines(nthreads);
 
-    // COM generator
-    std::array<double,3> COM = {0.000, 0.000, 0.000};
-
     // Will compute overlap + electric dipole moments
     engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l, deriv_order);
     engines[0].set_params(COM); // with COM as the multipole origin
@@ -923,7 +917,7 @@ std::vector<py::array> compute_dipole_derivs(std::vector<int> deriv_vec) {
 }
 
 // Computes nuclear derivatives of dipole and quadrupole integrals
-std::vector<py::array> compute_quadrupole_derivs(std::vector<int> deriv_vec) {
+std::vector<py::array> compute_quadrupole_derivs(std::array<double,3> COM, std::vector<int> deriv_vec) {
     assert(ncart == deriv_vec.size() && "Derivative vector incorrect size for this molecule.");
     // Get order of differentiation
     int deriv_order = accumulate(deriv_vec.begin(), deriv_vec.end(), 0);
@@ -944,9 +938,6 @@ std::vector<py::array> compute_quadrupole_derivs(std::vector<int> deriv_vec) {
     // Integral engine
     std::vector<libint2::Engine> engines(nthreads);
 
-    // COM generator
-    std::array<double,3> COM = {0.000, 0.000, 0.000};
-
     // Will compute overlap + electric dipole moments
     engines[0] = libint2::Engine(libint2::Operator::emultipole2, max_nprim, max_l, deriv_order);
     engines[0].set_params(COM); // with COM as the multipole origin
@@ -1587,14 +1578,18 @@ void compute_1e_deriv_disk(std::string type, int max_deriv_order) {
 //      ...
 // The number of unique derivatives is essentially equal to the size of the
 // generalized upper triangle of the derivative tensor.
-void compute_dipole_deriv_disk(int max_deriv_order) {
+void compute_dipole_deriv_disk(std::array<double,3> COM, int max_deriv_order) {
     std::cout << "Writing dipole integral derivative tensors up to order " << max_deriv_order << " to disk...";
     long total_deriv_slices = 0;
     for (int i = 1; i <= max_deriv_order; i++){
         total_deriv_slices += how_many_derivs(natom, i);
     }
 
+    double check = (nbf1 * nbf2 * total_deriv_slices * 8) * (1e-9);
+    assert(check < 10 && "Total disk space required for ERI's exceeds 10 GB. Increase threshold and recompile to proceed.");
+
     // Shell pairs after screening
+    const auto bs1_equiv_bs2 = (bs1 == bs2);
     auto shellpairs = build_shellpairs(bs1, bs2);
 
     // Create H5 File and prepare to fill with 0.0's
@@ -1618,9 +1613,6 @@ void compute_dipole_deriv_disk(int max_deriv_order) {
         // Define engines and buffers
         std::vector<libint2::Engine> engines(nthreads);
 
-        // COM generator
-        std::array<double,3> COM = {0.000, 0.000, 0.000};
-
         // Will compute overlap + electric dipole moments
         engines[0] = libint2::Engine(libint2::Operator::emultipole1, max_nprim, max_l, deriv_order);
         engines[0].set_params(COM); // with COM as the multipole origin
@@ -1723,7 +1715,7 @@ void compute_dipole_deriv_disk(int max_deriv_order) {
                 }
 
                 // Loop over shell block for each buffer index which contributes to this derivative
-                if (p1 != p2) {
+                if (bs1_equiv_bs2 && p1 != p2) {
                     for(auto i = 0; i < buffer_indices.size(); ++i) {
                         auto mu_x_shellset = buf_vec[buffer_indices[i] + 1];
                         auto mu_y_shellset = buf_vec[buffer_indices[i] + 2];
@@ -1772,7 +1764,7 @@ void compute_dipole_deriv_disk(int max_deriv_order) {
             Mu_Y_dataset->write(Mu_Y_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
             Mu_Z_dataset->write(Mu_Z_shellset_slab_12, PredType::NATIVE_DOUBLE, mspace, fspace);
 
-            if (p1 != p2) {
+            if (bs1_equiv_bs2 && p1 != p2) {
                 // Now write this shell set slab to HDF5 file
                 // Create file space hyperslab, defining where to write data to in file
                 hsize_t count_T[3] = {n2, n1, nderivs_triu};
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index de1e519..d2ef2a9 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -1,12 +1,12 @@
 # NOTE: These paths below need to be edited such that they point to a set of 
 # Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
-CC      := g++-10
+CC      := g++
 # Options passed to compiler, add "-fopenmp" if intending to use OpenMP
-CFLAGS  := -O3 -fPIC -fopenmp -g
+CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/ecm23353/miniconda3/envs/p4dev
+LIBINT_PREFIX := /home/ecm23353/psi_env
 # Conda prefix location, it is suggested to use conda to install nearly all dependencies
-CONDA_PREFIX := /home/ecm23353/miniconda3/envs/p4dev
+CONDA_PREFIX := /home/ecm23353/psi_env
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
@@ -14,9 +14,9 @@ L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
 I3 := $(CONDA_PREFIX)/include/eigen3
 # Python headers location 
-I4 := $(CONDA_PREFIX)/include/python3.11
+I4 := $(CONDA_PREFIX)/include/python3.10
 # Pybind11 headers location 
-I5 := $(CONDA_PREFIX)/lib/python3.11/site-packages/pybind11/include
+I5 := $(CONDA_PREFIX)/lib/python3.10/site-packages/pybind11/include
 # HDF5 headers, static and shared libraries 
 I6 := $(CONDA_PREFIX)/include
 L2 := $(CONDA_PREFIX)/lib
@@ -25,7 +25,7 @@ RPATH := -Wl,-rpath,"$(CONDA_PREFIX)/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
-TARGETS := libint_interface.cpython-311-x86_64-linux-gnu.so
+TARGETS := libint_interface.cpython-310-x86_64-linux-gnu.so
 OBJ     := libint_interface.o
 
 # Rest is boilerplate. Do not edit unless you know what you're doing.
diff --git a/quax/integrals/oei.py b/quax/integrals/oei.py
index c92782f..189ddc8 100644
--- a/quax/integrals/oei.py
+++ b/quax/integrals/oei.py
@@ -14,8 +14,8 @@ class OEI(object):
     def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         with open(xyz_path, 'r') as f:
             tmp = f.read()
-        molecule = psi4.core.Molecule.from_string(tmp, 'xyz+')
-        natoms = molecule.natom()
+        mol = psi4.core.Molecule.from_string(tmp, 'xyz+')
+        natoms = mol.natom()
 
         nbf1 = basis1.nbf()
         nbf2 = basis2.nbf()
@@ -40,6 +40,9 @@ def __init__(self, basis1, basis2, xyz_path, max_deriv_order, mode):
         self.nbf1 = nbf1
         self.nbf2 = nbf2
 
+        com = mol.center_of_mass()
+        self.com = list([com[0], com[1], com[2]])
+
         # Create new JAX primitives for overlap, kinetic, potential evaluation and their derivatives
         self.overlap_p = jax.core.Primitive("overlap")
         self.overlap_deriv_p = jax.core.Primitive("overlap_deriv")
@@ -131,7 +134,7 @@ def potential_impl(self, geom):
         return jnp.asarray(V)
 
     def dipole_impl(self, geom):
-        Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_ints()
+        Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_ints(self.com)
         Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
         Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
         Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
@@ -139,7 +142,7 @@ def dipole_impl(self, geom):
     
     def quadrupole_impl(self, geom):
         Mu_X, Mu_Y, Mu_Z, Th_XX, Th_XY,\
-            Th_XZ, Th_YY, Th_YZ, Th_ZZ = libint_interface.compute_quadrupole_ints()
+            Th_XZ, Th_YY, Th_YZ, Th_ZZ = libint_interface.compute_quadrupole_ints(self.com)
         Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
         Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
         Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
@@ -253,7 +256,7 @@ def dipole_deriv_impl(self, geom, deriv_vec):
         idx = get_deriv_vec_idx(deriv_vec)
 
         if self.mode == 'dipole':
-            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(deriv_vec)
+            Mu_X, Mu_Y, Mu_Z = libint_interface.compute_dipole_derivs(self.com, deriv_vec)
             Mu_X = Mu_X.reshape(self.nbf1, self.nbf2)
             Mu_Y = Mu_Y.reshape(self.nbf1, self.nbf2)
             Mu_Z = Mu_Z.reshape(self.nbf1, self.nbf2)
diff --git a/quax/methods/ccsd.py b/quax/methods/ccsd.py
index 0f5dd95..d60bb4b 100644
--- a/quax/methods/ccsd.py
+++ b/quax/methods/ccsd.py
@@ -1,5 +1,5 @@
 import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 
 from .energy_utils import tei_transformation
diff --git a/quax/methods/ccsd_t.py b/quax/methods/ccsd_t.py
index bc27957..4a7e303 100644
--- a/quax/methods/ccsd_t.py
+++ b/quax/methods/ccsd_t.py
@@ -1,5 +1,5 @@
 import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 from jax.lax import while_loop
 
diff --git a/quax/methods/energy_utils.py b/quax/methods/energy_utils.py
index bfcf9be..5653638 100644
--- a/quax/methods/energy_utils.py
+++ b/quax/methods/energy_utils.py
@@ -1,5 +1,5 @@
 import jax
-from jax.config import config; config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 from functools import partial
 
diff --git a/quax/methods/hartree_fock.py b/quax/methods/hartree_fock.py
index f340d4d..c11318b 100644
--- a/quax/methods/hartree_fock.py
+++ b/quax/methods/hartree_fock.py
@@ -58,7 +58,7 @@ def form_shift():
     def rhf_iter(F, D):
         E_scf = jnp.einsum('pq,pq->', F + H, D) + Enuc
         Fp = A.T @ F @ A
-        Fp = Fp + shift 
+        Fp += shift
         eps, C2 = jnp.linalg.eigh(Fp)
         C = A @ C2
         Cocc = C[:, :ndocc]
@@ -88,7 +88,7 @@ def scf_procedure(carry):
         return (iter + 1, de_, drms_, eps_, C_, D_old, D_, e_scf)
 
     # Create Guess Density
-    D = jnp.copy(H)
+    D = jax.lax.cond(options['guess_core'], lambda: jnp.copy(H), lambda: jnp.zeros_like(H))
     JK = 2 * jk_build(G, D)
     JK -= jk_build(G.transpose((0,2,1,3)), D)
     F = H + JK
diff --git a/quax/methods/ints.py b/quax/methods/ints.py
index f17abef..fb71603 100644
--- a/quax/methods/ints.py
+++ b/quax/methods/ints.py
@@ -72,7 +72,12 @@ def compute_dipole_ints(geom, basis1, basis2, xyz_path, deriv_order, options):
         if check_multipole:
             Mu_ = oei_obj.dipole(geom)
         else:
-            libint_interface.compute_dipole_deriv_disk(deriv_order)
+            with open(xyz_path, 'r') as f:
+                tmp = f.read()
+            com = psi4.core.Molecule.from_string(tmp, 'xyz+').center_of_mass()
+            com = list([com[0], com[1], com[2]])
+
+            libint_interface.compute_dipole_deriv_disk(com, deriv_order)
             Mu_ = oei_obj.dipole(geom)
     else:
         # Precompute TEI derivatives
@@ -373,4 +378,4 @@ def check_tei_disk(int_type, basis1, basis2, basis3, basis4, deriv_order, addres
         if correct_int_derivs:
             print("Integral derivatives appear to be correct. Avoiding recomputation.")
         return correct_int_derivs
- """
\ No newline at end of file
+ """
diff --git a/quax/methods/mp2.py b/quax/methods/mp2.py
index fcd8401..39fca0e 100644
--- a/quax/methods/mp2.py
+++ b/quax/methods/mp2.py
@@ -1,5 +1,5 @@
 import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 from jax.lax import fori_loop
 
diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 90e0c78..0ad313a 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -1,5 +1,5 @@
 import jax 
-from jax.config import config; config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)
 import jax.numpy as jnp
 from jax.lax import fori_loop, cond
 
@@ -60,7 +60,7 @@ def loop_energy(idx, f12_corr):
 
         D_ij = D[i, j, :, :]
 
-        GD_ij = jnp.einsum('ab,ab->ab', G[i - 1, j - 1, :, :], D_ij, optimize='optimal')
+        GD_ij = jnp.einsum('ab,ab->ab', G[i - ncore, j - ncore, :, :], D_ij, optimize='optimal')
         V_ij = V[i, j, :, :] - jnp.einsum('klab,ab->kl', C, GD_ij, optimize='optimal')
 
         V_s = 0.25 * (t_(i, j, i, j) + t_(i, j, j, i)) * kd * (V_ij[i, j] + V_ij[j, i])
@@ -82,15 +82,20 @@ def loop_energy(idx, f12_corr):
 
         f12_corr += kd * (2.0 * V_s + B_s)         # Singlet Pair Energy
         f12_corr += 3.0 * kd * (2.0 * V_t + B_t)   # Triplet Pair Energy
-
         return f12_corr
 
-    start = ndocc if ncore > 0 else 0
-    dE_mp2f12 = fori_loop(start, indices.shape[0], loop_energy, 0.0)
+    def frzn_core(idx, accu):
+        accu += ndocc - idx
+        return accu
+
+    start = fori_loop(0, ncore, frzn_core, 0)
+    dE_f12 = fori_loop(start, indices.shape[0], loop_energy, 0.0)
 
     E_s = cabs_singles(f, spaces)
 
-    return E_mp2 + dE_mp2f12 + E_s
+    jax.debug.print("  Total MP2-F12/3C(FIX) Energy:         {}", E_mp2 + dE_f12 + E_s)
+
+    return E_mp2 + dE_f12 + E_s
 
 # CABS Singles
 def cabs_singles(f, spaces):
@@ -150,13 +155,16 @@ def form_h(geom, basis_set, cabs_set, C_mats, spaces, fields, xyz_path, deriv_or
 
     mo1 = one_body_mo_computer(geom, basis_set, basis_set, C_obs, C_obs, fields, xyz_path, deriv_order, options)
     tv = tv.at[:nobs, :nobs].set(mo1) # <O|O>
+    del mo1
 
     mo2 = one_body_mo_computer(geom, basis_set, cabs_set, C_obs, C_cabs, fields, xyz_path, deriv_order, options)
     tv = tv.at[:nobs, nobs:nri].set(mo2) # <O|C>
     tv = tv.at[nobs:nri, :nobs].set(mo2.T) # <C|O>
+    del mo2
 
     mo3 = one_body_mo_computer(geom, cabs_set, cabs_set, C_cabs, C_cabs, fields, xyz_path, deriv_order, options)
     tv = tv.at[nobs:nri, nobs:nri].set(mo3) # <C|C>
+    del mo3
 
     return tv
 
@@ -182,15 +190,18 @@ def form_J(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, opt
     mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
                                C_obs, C_occ, C_obs, C_occ, xyz_path, deriv_order, options)
     eri = eri.at[:nobs, :, :nobs, :].set(mo1) # <Oo|Oo>
+    del mo1
 
     mo2 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, basis_set,\
                               C_cabs, C_occ, C_obs, C_occ, xyz_path, deriv_order, options)
     eri = eri.at[nobs:nri, :, :nobs, :].set(mo2) # <Co|Oo>
     eri = eri.at[:nobs, :, nobs:nri, :].set(jnp.transpose(mo2, (2,3,0,1))) # <Oo|Co>
+    del mo2
 
     mo3 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, cabs_set, basis_set,\
                               C_cabs, C_occ, C_cabs, C_occ, xyz_path, deriv_order, options)
     eri = eri.at[nobs:nri, :, nobs:nri, :].set(mo3) # <Co|Co>
+    del mo3
 
     return eri
 
@@ -203,15 +214,18 @@ def form_K(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, opt
     mo1 = two_body_mo_computer(geom, "eri", basis_set, basis_set, basis_set, basis_set,\
                               C_obs, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
     eri = eri.at[:nobs, :, :, :nobs].set(mo1) # <Oo|oO>
+    del mo1
 
     mo2 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, basis_set,\
                               C_cabs, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
     eri = eri.at[nobs:nri, :, :, :nobs].set(mo2) # <Co|oO>
     eri = eri.at[:nobs, :, :, nobs:nri].set(jnp.transpose(mo2, (3,2,1,0))) # <Oo|oC>
+    del mo2
 
     mo3 = two_body_mo_computer(geom, "eri", cabs_set, basis_set, basis_set, cabs_set,\
                               C_cabs, C_occ, C_occ, C_cabs, xyz_path, deriv_order, options)
     eri = eri.at[nobs:nri, :, :, nobs:nri].set(mo3) # <Co|oC>
+    del mo3
 
     return eri
 
@@ -224,10 +238,12 @@ def form_ooO1(geom, int_type, basis_set, cabs_set, C_mats, spaces, xyz_path, der
     mo1 = two_body_mo_computer(geom, int_type, basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
     eri = eri.at[:, :, :, :nobs].set(mo1) # <oo|OO>
+    del mo1
 
     mo2 = two_body_mo_computer(geom, int_type, basis_set, basis_set, basis_set, cabs_set,\
                                C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
     eri = eri.at[:, :, :, nobs:].set(mo2) # <oo|OC>
+    del mo2
 
     return eri
 
@@ -240,15 +256,18 @@ def form_F(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, opt
     mo1 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_obs, C_obs, xyz_path, deriv_order, options)
     f12 = f12.at[:, :, :nobs, :nobs].set(mo1) # <oo|OO>
+    del mo1
 
     mo2 = two_body_mo_computer(geom, "f12", basis_set, basis_set, basis_set, cabs_set,\
                               C_occ, C_occ, C_obs, C_cabs, xyz_path, deriv_order, options)
     f12 = f12.at[:, :, :nobs, nobs:].set(mo2) # <oo|OC>
     f12 = f12.at[:, :, nobs:, :nobs].set(jnp.transpose(mo2, (1,0,3,2))) # <oo|CO>
+    del mo2
 
     mo3 = two_body_mo_computer(geom, "f12", basis_set, basis_set, cabs_set, cabs_set,\
                               C_occ, C_occ, C_cabs, C_cabs, xyz_path, deriv_order, options)
     f12 = f12.at[:, :, nobs:, nobs:].set(mo3) # <oo|CC>
+    del mo3
 
     return f12
 
@@ -261,10 +280,12 @@ def form_F2(geom, basis_set, cabs_set, C_mats, spaces, xyz_path, deriv_order, op
     mo1 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, basis_set,\
                               C_occ, C_occ, C_occ, C_obs, xyz_path, deriv_order, options)
     f12_squared = f12_squared.at[:, :, :, :nobs].set(mo1) # <oo|oO>
+    del mo1
 
     mo2 = two_body_mo_computer(geom, "f12_squared", basis_set, basis_set, basis_set, cabs_set,\
                               C_occ, C_occ, C_occ, C_cabs, xyz_path, deriv_order, options)
     f12_squared = f12_squared.at[:, :, :, nobs:].set(mo2) # <oo|oC>
+    del mo2
 
     return f12_squared
 
@@ -362,4 +383,4 @@ def form_B(geom, basis_set, cabs_set, f, k, fk_o1, C_mats, spaces, xyz_path, der
 
     B_nosymm = Uf + terms + jnp.transpose(terms, (1,0,3,2)) # nmlk->mnkl
 
-    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1))) # mnkl + klmn
\ No newline at end of file
+    return 0.5 * (B_nosymm + jnp.transpose(B_nosymm, (2,3,0,1))) # mnkl + klmn
diff --git a/tests/test_dipoles.py b/tests/test_dipoles.py
index f371108..1e7dbcc 100644
--- a/tests/test_dipoles.py
+++ b/tests/test_dipoles.py
@@ -29,17 +29,17 @@
 def test_hartree_fock_gradient(method='hf'):
     psi4.properties(method, properties=['dipole'])
     psi_deriv = psi4.variable("SCF DIPOLE")
-    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, options=options).reshape(-1,3)
-    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, partial=(0,), options=options)
+    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, partial=(0,), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
-    assert np.allclose(psi_deriv[0,0], quax_partial0)
+    assert np.allclose(psi_deriv[0], quax_partial0)
 
 def test_ccsd_gradient(method='ccsd'):
     psi4.properties(method, properties=['dipole'])
     psi_deriv = psi4.variable("CC DIPOLE")
-    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, options=options).reshape(-1,3)
-    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, electric_field=efield, deriv_order=1, partial=(0,), options=options)
-    assert np.allclose(psi_deriv, quax_deriv)
-    assert np.allclose(psi_deriv[0,0], quax_partial0)
+    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, partial=(0,), options=options)
+    assert np.allclose(psi_deriv, quax_deriv, rtol=1e-4, atol=1e-4)
+    assert np.allclose(psi_deriv[0], quax_partial0)
 
 
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index d45ce51..8da6b6f 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -38,7 +38,7 @@ def test_mp2_hessian(method='mp2'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv)
+    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_ccsd_t_hessian(method='ccsd(t)'):
@@ -46,6 +46,6 @@ def test_ccsd_t_hessian(method='ccsd(t)'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv)
+    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 

From 84f593bb0363cb71c98bbdb689a7e0650d796ae5 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 10 Jun 2024 20:44:49 -0400
Subject: [PATCH 71/91] Update CI Python

---
 .github/workflows/continuous_integration.yml | 6 +++---
 quax/core.py                                 | 1 +
 tests/test_energies.py                       | 4 +---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 04b72fb..db22fb6 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.7
+    - name: Set up Python 3.10
       uses: actions/setup-python@v2
       with:
-        python-version: 3.7
+        python-version: 3.10
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
@@ -24,7 +24,7 @@ jobs:
     - name: Install dependencies
       shell: bash -l {0}
       run: |
-        conda install python=3.7
+        conda install python=3.10
         conda install -c psi4 psi4
         conda install -c conda-forge jax
         conda install -c conda-forge jaxlib
diff --git a/quax/core.py b/quax/core.py
index 952edf3..11812e1 100644
--- a/quax/core.py
+++ b/quax/core.py
@@ -217,6 +217,7 @@ def electronic_energy(*args, options=options, deriv_order=deriv_order_R):
         else:
             print("Error: Order {},{} mixed derivatives are not exposed to the API.".format(deriv_order_F, deriv_order_R))
             deriv = 0
+        deriv = jnp.round(deriv, 14)
         return np.asarray(deriv)
     
     # Partial derivatives
diff --git a/tests/test_energies.py b/tests/test_energies.py
index 91b5266..a019c8a 100644
--- a/tests/test_energies.py
+++ b/tests/test_energies.py
@@ -19,9 +19,7 @@
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
                   'd_convergence':1e-10,
-                  'puream': 0,
-                  'points':5,
-                  'fd_project':False})
+                  'puream': 0})
 
 def test_hartree_fock(method='hf'):
     psi_e = psi4.energy(method + '/' + basis_name)

From 37f9079429c207cc537325a293e72241b56de077 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 10:18:37 -0400
Subject: [PATCH 72/91] Fix tests

---
 quax/methods/mp2f12.py  |  2 --
 tests/test_dipoles.py   | 44 +++++++++++++++++++++++++++++------------
 tests/test_gradients.py |  8 ++++----
 tests/test_hessians.py  | 14 ++++++-------
 4 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/quax/methods/mp2f12.py b/quax/methods/mp2f12.py
index 0ad313a..5cdd4d9 100644
--- a/quax/methods/mp2f12.py
+++ b/quax/methods/mp2f12.py
@@ -93,8 +93,6 @@ def frzn_core(idx, accu):
 
     E_s = cabs_singles(f, spaces)
 
-    jax.debug.print("  Total MP2-F12/3C(FIX) Energy:         {}", E_mp2 + dE_f12 + E_s)
-
     return E_mp2 + dE_f12 + E_s
 
 # CABS Singles
diff --git a/tests/test_dipoles.py b/tests/test_dipoles.py
index 1e7dbcc..f084bae 100644
--- a/tests/test_dipoles.py
+++ b/tests/test_dipoles.py
@@ -14,32 +14,50 @@
 units bohr
 """)
 basis_name = 'sto-3g'
-psi4.set_options({
-                  'basis': basis_name,
+psi4.set_options({'basis': basis_name,
                   'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
                   'd_convergence':1e-10,
-                  'puream': 0
-                })
+                  'puream': 0})
 
-options = {'damping':True, 'spectral_shift':False}
+options = {'damping': True, 'spectral_shift': False}
 efield = np.zeros((3))
 
-def test_hartree_fock_gradient(method='hf'):
-    psi4.properties(method, properties=['dipole'])
-    psi_deriv = psi4.variable("SCF DIPOLE")
+def findif_dipole(method, pert):
+    lambdas = [pert, -pert, 2.0*pert, -2.0*pert]
+    dip_vec = np.zeros((3))
+
+    for i in range(3):
+        pert_vec = [0, 0, 0]
+        energies = []
+        for l in lambdas:
+            pert_vec[i] = l
+            psi4.set_options({'perturb_h': True,
+                              'perturb_with': 'dipole',
+                              'perturb_dipole': pert_vec})
+            energies.append(psi4.energy(method))
+        val = (8.0*energies[0] - 8.0*energies[1] - energies[2] + energies[3]) / (12.0*pert)
+        dip_vec[i] = val
+    return dip_vec
+
+def test_hartree_fock_dipole(method='hf'):
+    psi_deriv = findif_dipole(method, 0.0005)
     quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, options=options).reshape(-1,3)
     quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, partial=(0,), options=options)
     assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0], quax_partial0)
 
-def test_ccsd_gradient(method='ccsd'):
-    psi4.properties(method, properties=['dipole'])
-    psi_deriv = psi4.variable("CC DIPOLE")
+def test_mp2_dipole(method='mp2'):
+    psi_deriv = findif_dipole(method, 0.0005)
     quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, options=options).reshape(-1,3)
     quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, partial=(0,), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, rtol=1e-4, atol=1e-4)
+    assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0], quax_partial0)
 
-
+def test_ccsd_t_dipole(method='ccsd(t)'):
+    psi_deriv = findif_dipole(method, 0.0005)
+    quax_deriv = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, options=options).reshape(-1,3)
+    quax_partial0 = quax.core.efield_deriv(molecule, basis_name, method, efield=efield, deriv_order=1, partial=(0,), options=options)
+    assert np.allclose(psi_deriv, quax_deriv, atol=1e-7)
+    assert np.allclose(psi_deriv[0], quax_partial0)
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
index f562c6d..93dcf60 100644
--- a/tests/test_gradients.py
+++ b/tests/test_gradients.py
@@ -18,12 +18,12 @@
                   'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
-                  'd_convergence':1e-10,
+                  'd_convergence': 1e-10,
                   'puream': 0,
-                  'points':5,
-                  'fd_project':False})
+                  'points': 5,
+                  'fd_project': False})
 
-options = {'damping':True, 'spectral_shift':False}
+options = {'damping': True, 'spectral_shift': False}
 
 def test_hartree_fock_gradient(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.gradient(method + '/' + basis_name)), 10)
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index 8da6b6f..1374598 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -18,19 +18,19 @@
                   'scf_type': 'pk',
                   'mp2_type':'conv',
                   'e_convergence': 1e-10,
-                  'd_convergence':1e-10,
+                  'd_convergence': 1e-10,
                   'puream': 0,
-                  'points':5,
-                  'fd_project':False})
+                  'points': 5,
+                  'fd_project': False})
 
-options = {'damping':True, 'spectral_shift':False}
+options = {'damping': True, 'spectral_shift': False}
 
 def test_hartree_fock_hessian(method='hf'):
     psi_deriv = np.round(np.asarray(psi4.hessian(method + '/' + basis_name)), 10)
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv)
+    assert np.allclose(psi_deriv, quax_deriv, rtol=5e-5)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_mp2_hessian(method='mp2'):
@@ -38,7 +38,7 @@ def test_mp2_hessian(method='mp2'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
+    assert np.allclose(psi_deriv, quax_deriv, rtol=5e-5)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_ccsd_t_hessian(method='ccsd(t)'):
@@ -46,6 +46,6 @@ def test_ccsd_t_hessian(method='ccsd(t)'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
+    assert np.allclose(psi_deriv, quax_deriv, rtol=7e-5)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 

From b9b0c6eaad9912b0484526cd14cbc3ef6dc12126 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 10:32:12 -0400
Subject: [PATCH 73/91] README and versioning

---
 README.md | 34 +++++++++++++++++-----------------
 setup.py  |  2 +-
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 016b95c..134d57d 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@
 
 You have found Quax. The paper outlining this work was just [recently published](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.1c00607). 
 This library supports a simple and clean API for obtaining higher-order energy derivatives of electronic
-structure computations such as Hartree-Fock, second-order Møller-Plesset perturbation theory (MP2), and
-coupled cluster with singles, doubles, and perturbative triples excitations [CCSD(T)].
-Whereas most codes support only analytic gradient and occasionally Hessian computations,
-this code can compute analytic derivatives of arbitrary order. 
+structure computations such as Hartree-Fock, second-order Møller-Plesset perturbation theory (MP2),
+explicitly correlated MP2 (MP2-F12), and coupled cluster with singles, doubles, and perturbative triples 
+excitations [CCSD(T)].
+Whereas most codes support only analytic gradient and occasionally Hessian computations, this code can 
+compute analytic derivatives of arbitrary order with respect to both nuclear geometry and electric field.
 We use [JAX](https://github.com/google/jax) for automatically differentiating electronic structure computations.
 The code can be easily extended to support other methods, for example
 using the guidance offered by the [Psi4Numpy project](https://github.com/psi4/psi4numpy).
@@ -44,22 +45,22 @@ molecule = psi4.geometry("""
 
 energy = quax.core.energy(molecule, 'sto-3g', 'hf')
 print(energy)
-gradient = quax.core.derivative(molecule, 'sto-3g', 'hf', deriv_order=1)
+gradient = quax.core.geom_deriv(molecule, 'sto-3g', 'hf', deriv_order=1)
 print(gradient)
-hessian = quax.core.derivative(molecule, 'sto-3g', 'hf', deriv_order=2)
+hessian = quax.core.geom_deriv(molecule, 'sto-3g', 'hf', deriv_order=2)
 print(hessian)
 
-dz1 = quax.core.partial_derivative(molecule, 'sto-3g', 'hf', deriv_order=1, partial=(2,))
+dz1 = quax.core.geom_deriv(molecule, 'sto-3g', 'hf', deriv_order=1, partial=(2,))
 print(dz1)
 
-dz1_dz2 = quax.core.partial_derivative(molecule, 'sto-3g', 'hf', deriv_order=2, partial=(2,5))
+dz1_dz2 = quax.core.geom_deriv(molecule, 'sto-3g', 'hf', deriv_order=2, partial=(2,5))
 print(dz1_dz2)
 
 print('Partial gradient matches gradient element: ', dz1 == gradient[2])
 print('Partial hessian matches hessian element: ', dz1_dz2 == hessian[2,5])
 ```
 
-Above, in the `quax.core.partial_derivative` function calls, the `partial` arguments describe the address of the element in the _n_th order derivative
+Above, in the `quax.core.geom_deriv` function calls, the `partial` arguments describe the address of the element in the _n_th order derivative
 tensor you want to compute. The dimensions of a derivative tensor correspond to the row-wise flattened Cartesian coordinates, with 0-based indexing.
 For _N_ Cartesian coordinates, gradient is a size _N_ vector, Hessian a _N_ by _N_ matrix, and cubic and quartic derivative tensors are rank-3 and rank-4 tensors with dimension size _N_.
 
@@ -77,7 +78,7 @@ molecule = psi4.geometry('''
                          units bohr
                          ''')
 
-quartic = quax.core.derivative(molecule, '6-31g', 'ccsd(t)', deriv_order=4)
+quartic = quax.core.geom_deriv(molecule, '6-31g', 'ccsd(t)', deriv_order=4)
 ```
 
 Perhaps that's too expensive/slow. You can instead compute quartic partial derivatives:
@@ -93,7 +94,7 @@ molecule = psi4.geometry('''
                          units bohr
                          ''')
 
-dz1_dz1_dz2_dz2 = quax.core.partial_derivative(molecule, '6-31g', 'ccsd(t)', deriv_order=4, partial=(2,2,5,5))
+dz1_dz1_dz2_dz2 = quax.core.geom_deriv(molecule, '6-31g', 'ccsd(t)', deriv_order=4, partial=(2,2,5,5))
 ```
 
 Similar computations can be split across multiple nodes in an embarassingly parallel fashion, and one can take full advantage of symmetry so that only the unique elements are computed.
@@ -132,7 +133,9 @@ python setup.py install
 ```
 
 ### Building the Libint Interface
-For the Libint interface, you nust install those dependencies as well.
+A [Docker image](https://hub.docker.com/r/ericacmitchell/libint_derivs) has been made for Libint with up to 2nd-order derivatives and maximum angular momentum of 5 for standard integrals, Cartesian-multipole integrals, and F12-type integrals.
+
+Otherwise, for the Libint interface, you must install those dependencies as well.
 ```
 conda install libstdcxx-ng gcc_linux-64 gxx_linux-64 ninja boost eigen3 gmp bzip2 cmake pybind11
 ```
@@ -174,22 +177,19 @@ The --target check runs test suite, and finally the install command installs the
 tar -xvf libint_*.tgz
 cd libint-*/
 mkdir PREFIX
-cmake . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+cmake -GNinja . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . -- -j4
 cmake --build . --target check
 cmake --build . --target install
 ```
 
 Note that the following cmake command may not find various libraries for the dependencies of Libint.
-`cmake . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
+`cmake -GNinja . -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
 To fix this, you may need to explicitly point to it
 `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/libint/dependency/lib/`
 and then run the above cmake command.
 If using Anaconda, the path is probably in the environment directory `/path/to/envs/quax/lib/`.
 
-Also note that Libint recommends using Ninja to build for performance reasons. This can be done if Ninja is installed:
-`cmake . -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/libint/PREFIX/ -DCMAKE_POSITION_INDEPENDENT_CODE=ON`
-
 ### Compiling the Libint-Quax interface
 Once Libint is installed, the makefile in `quax/integrals/makefile` needs to be edited with your compiler and the proper paths specifying the locations
 of headers and libraries for Libint, pybind11, HDF5, and python. 
diff --git a/setup.py b/setup.py
index 5c61f00..da55644 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 if __name__ == "__main__":
     setuptools.setup(
         name='quax',
-        version="0.2.0a1",
+        version="0.3.0",
         description='Arbitrary order derivatives of electronic structure computations.',
         author='Adam Abbott, Erica Mitchell',
         author_email='adabbott@uga.edu, emitchell@uga.edu',

From 6a8b5395c0dfc289fc08795088c6e47464d7e8a3 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 10:37:55 -0400
Subject: [PATCH 74/91] Reversioning

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index da55644..7b321f8 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 if __name__ == "__main__":
     setuptools.setup(
         name='quax',
-        version="0.3.0",
+        version="0.2.0",
         description='Arbitrary order derivatives of electronic structure computations.',
         author='Adam Abbott, Erica Mitchell',
         author_email='adabbott@uga.edu, emitchell@uga.edu',

From c77a392cfed60b40165a6989924e3e3c7f850871 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 10:51:26 -0400
Subject: [PATCH 75/91] Update GitHub CI

---
 .github/workflows/continuous_integration.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index db22fb6..2e90d00 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -12,11 +12,10 @@ jobs:
   build-linux:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v2
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
       with:
-        python-version: 3.10
+        python-version: '3.10' 
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory

From c2a0b749cc72a51db17afb754221e521b0a0f25f Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 11:18:45 -0400
Subject: [PATCH 76/91] CI PythonPath, JAX version

---
 .github/workflows/continuous_integration.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 2e90d00..58d50c6 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -20,13 +20,14 @@ jobs:
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
+        echo "PYTHONPATH=$GITHUB_WORKSPACE" >> $GITHUB_ENV
     - name: Install dependencies
       shell: bash -l {0}
       run: |
         conda install python=3.10
         conda install -c psi4 psi4
-        conda install -c conda-forge jax
-        conda install -c conda-forge jaxlib
+        conda install -c conda-forge 'jax>=0.4.19'
+        conda install -c conda-forge 'jaxlib>=0.4.19'
         conda install -c conda-forge h5py
         pip install -e .
     - name: Test with pytest

From ebee9b7b10f8fbc587bd580b0373284ff00a364a Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 11:39:49 -0400
Subject: [PATCH 77/91] CI can't find module

---
 .github/workflows/continuous_integration.yml | 14 +++++++++++---
 quax/integrals/makefile                      |  4 ++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 58d50c6..43ea182 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -19,18 +19,26 @@ jobs:
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $GITHUB_WORKSPACE
         echo $CONDA/bin >> $GITHUB_PATH
-        echo "PYTHONPATH=$GITHUB_WORKSPACE" >> $GITHUB_ENV
     - name: Install dependencies
       shell: bash -l {0}
       run: |
         conda install python=3.10
         conda install -c psi4 psi4
-        conda install -c conda-forge 'jax>=0.4.19'
-        conda install -c conda-forge 'jaxlib>=0.4.19'
+        conda install -c conda-forge jax
+        conda install -c conda-forge jaxlib
+        conda install -c conda-forge libint
         conda install -c conda-forge h5py
         pip install -e .
+    - name: Build integrals
+      shell: bash -l {0}
+      run: |
+        cd $GITHUB_WORKSPACE/quax/integrals
+        make
+        cd $GITHUB_WORKSPACE
     - name: Test with pytest
       run: |
         conda install pytest
+        echo "PYTHONPATH=$GITHUB_WORKSPACE" >> $GITHUB_ENV
         pytest
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index d2ef2a9..5e6452f 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -4,9 +4,9 @@ CC      := g++
 # Options passed to compiler, add "-fopenmp" if intending to use OpenMP
 CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /home/ecm23353/psi_env
+LIBINT_PREFIX := /usr/share/miniconda
 # Conda prefix location, it is suggested to use conda to install nearly all dependencies
-CONDA_PREFIX := /home/ecm23353/psi_env
+CONDA_PREFIX := /usr/share/miniconda
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2

From 7756485dbe4bd19a8fedb53a93473bdd770c4e87 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 11:47:53 -0400
Subject: [PATCH 78/91] CI missing pybind11

---
 .github/workflows/continuous_integration.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 43ea182..e084bcd 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -19,7 +19,7 @@ jobs:
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $GITHUB_WORKSPACE
+        echo "${GITHUB_WORKSPACE}"
         echo $CONDA/bin >> $GITHUB_PATH
     - name: Install dependencies
       shell: bash -l {0}
@@ -28,6 +28,7 @@ jobs:
         conda install -c psi4 psi4
         conda install -c conda-forge jax
         conda install -c conda-forge jaxlib
+        conda install -c conda-forge pybind11
         conda install -c conda-forge libint
         conda install -c conda-forge h5py
         pip install -e .
@@ -40,5 +41,5 @@ jobs:
     - name: Test with pytest
       run: |
         conda install pytest
-        echo "PYTHONPATH=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+        echo "PYTHONPATH=${GITHUB_WORKSPACE}" >> $GITHUB_ENV
         pytest

From 2241c05a37ea84b06f37ed0526eb419cafbab72e Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 11:52:43 -0400
Subject: [PATCH 79/91] CI missing Eigen

---
 .github/workflows/continuous_integration.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index e084bcd..b636aea 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -19,7 +19,6 @@ jobs:
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo "${GITHUB_WORKSPACE}"
         echo $CONDA/bin >> $GITHUB_PATH
     - name: Install dependencies
       shell: bash -l {0}
@@ -29,6 +28,7 @@ jobs:
         conda install -c conda-forge jax
         conda install -c conda-forge jaxlib
         conda install -c conda-forge pybind11
+        conda install -c conda-forge eigen
         conda install -c conda-forge libint
         conda install -c conda-forge h5py
         pip install -e .
@@ -42,4 +42,5 @@ jobs:
       run: |
         conda install pytest
         echo "PYTHONPATH=${GITHUB_WORKSPACE}" >> $GITHUB_ENV
+        echo $GITHUB_WORKSPACE >> $GITHUB_PATH
         pytest

From 066738ed5c6acde0fb93fef1fff0b9782a4bc953 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 12:46:36 -0400
Subject: [PATCH 80/91] Fix test_hessian, use diff pytest cmd

---
 .github/workflows/continuous_integration.yml | 4 +---
 tests/test_hessians.py                       | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index b636aea..20f5214 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -41,6 +41,4 @@ jobs:
     - name: Test with pytest
       run: |
         conda install pytest
-        echo "PYTHONPATH=${GITHUB_WORKSPACE}" >> $GITHUB_ENV
-        echo $GITHUB_WORKSPACE >> $GITHUB_PATH
-        pytest
+        python -m pytest
diff --git a/tests/test_hessians.py b/tests/test_hessians.py
index 1374598..8aa43dc 100644
--- a/tests/test_hessians.py
+++ b/tests/test_hessians.py
@@ -30,7 +30,7 @@ def test_hartree_fock_hessian(method='hf'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, rtol=5e-5)
+    assert np.allclose(psi_deriv, quax_deriv)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_mp2_hessian(method='mp2'):
@@ -38,7 +38,7 @@ def test_mp2_hessian(method='mp2'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, rtol=5e-5)
+    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 
 def test_ccsd_t_hessian(method='ccsd(t)'):
@@ -46,6 +46,6 @@ def test_ccsd_t_hessian(method='ccsd(t)'):
     n = psi_deriv.shape[0]
     quax_deriv = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, options=options).reshape(n,n)
     quax_partial00 = quax.core.geom_deriv(molecule, basis_name, method, deriv_order=2, partial=(0,0), options=options)
-    assert np.allclose(psi_deriv, quax_deriv, rtol=7e-5)
+    assert np.allclose(psi_deriv, quax_deriv, atol=5e-7)
     assert np.allclose(psi_deriv[0,0], quax_partial00)
 

From b93bd6cad77a60162b7345048069bb2bb853da88 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 13:00:29 -0400
Subject: [PATCH 81/91] CI numpy version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7b321f8..2ebb2c0 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
         packages=setuptools.find_packages(where="quax"),
         package_dir={"": "quax"},
         install_requires=[
-            'numpy>=1.23',
+            'numpy>=1.23,<2.0',
             'jax>=0.4.19',
             'jaxlib>=0.4.19',
             'h5py>=2.8.0',

From c390ed7e8df00350c60ca91aafaab5149a4c3172 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 13:10:16 -0400
Subject: [PATCH 82/91] CI Psi4 from conda-forge

---
 .github/workflows/continuous_integration.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 20f5214..05f479f 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -24,7 +24,7 @@ jobs:
       shell: bash -l {0}
       run: |
         conda install python=3.10
-        conda install -c psi4 psi4
+        conda install -c conda-forge psi4
         conda install -c conda-forge jax
         conda install -c conda-forge jaxlib
         conda install -c conda-forge pybind11

From 55e16d8a36b8579f29443ecdf5b184abd10a1f6c Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 13:26:26 -0400
Subject: [PATCH 83/91] CI conda-forge

---
 .github/workflows/continuous_integration.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 05f479f..ac5d9ef 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -23,13 +23,14 @@ jobs:
     - name: Install dependencies
       shell: bash -l {0}
       run: |
-        conda install python=3.10
+        conda install -c conda-forge python=3.10
         conda install -c conda-forge psi4
         conda install -c conda-forge jax
         conda install -c conda-forge jaxlib
         conda install -c conda-forge pybind11
         conda install -c conda-forge eigen
         conda install -c conda-forge libint
+        conda install -c conda-forge hdf5
         conda install -c conda-forge h5py
         pip install -e .
     - name: Build integrals

From 8e888568279367c078cfaa9411ce090e28bf3fd6 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Mon, 24 Jun 2024 13:47:08 -0400
Subject: [PATCH 84/91] CI solver issues

---
 .github/workflows/continuous_integration.yml | 15 +++++++--------
 environment.yml                              |  7 +++++--
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index ac5d9ef..0c80674 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -24,14 +24,13 @@ jobs:
       shell: bash -l {0}
       run: |
         conda install -c conda-forge python=3.10
-        conda install -c conda-forge psi4
-        conda install -c conda-forge jax
-        conda install -c conda-forge jaxlib
-        conda install -c conda-forge pybind11
-        conda install -c conda-forge eigen
-        conda install -c conda-forge libint
-        conda install -c conda-forge hdf5
-        conda install -c conda-forge h5py
+        conda install conda-forge::psi4
+        conda install conda-forge::jax
+        conda install conda-forge::jaxlib
+        conda install conda-forge::pybind11
+        conda install conda-forge::eigen
+        conda install conda-forge::hdf5
+        conda install conda-forge::h5py
         pip install -e .
     - name: Build integrals
       shell: bash -l {0}
diff --git a/environment.yml b/environment.yml
index 7d38a5d..1888e45 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,11 +1,14 @@
 name: quax
 channels:
-  - psi4
   - conda-forge
-  - defaults
+  - nodefaults
 dependencies:
   - psi4
   - jax
   - jaxlib
+  - libint
+  - pybind11
+  - eigen
+  - hdf5
   - h5py
   - pytest

From adf2cd90380271d5effe285d85e3e05b5c568921 Mon Sep 17 00:00:00 2001
From: EricaCMitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 10:05:37 -0400
Subject: [PATCH 85/91] CI set solver for python

---
 .github/workflows/continuous_integration.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 0c80674..94ba36a 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -23,7 +23,8 @@ jobs:
     - name: Install dependencies
       shell: bash -l {0}
       run: |
-        conda install -c conda-forge python=3.10
+        conda install python=3.10
+        conda config --set solver classic
         conda install conda-forge::psi4
         conda install conda-forge::jax
         conda install conda-forge::jaxlib

From 902b21b57546d69b6ac86cb6ea977c1412708968 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 16:51:18 +0000
Subject: [PATCH 86/91] Use resources from Molssi Cookiecutter CMS

---
 .codecov.yml                                  | 14 +++
 .gitattributes                                |  1 +
 .github/CONTRIBUTING.md                       | 42 +++++++++
 .github/PULL_REQUEST_TEMPLATE.md              | 12 +++
 .github/workflows/continuous_integration.yml  | 88 ++++++++++++-------
 .gitignore                                    | 34 ++++---
 CODE_OF_CONDUCT.md                            | 77 ++++++++++++++++
 MANIFEST.in                                   |  3 +
 devtools/README.md                            | 44 ++++++++++
 .../conda-envs/test_env.yaml                  | 15 ++--
 pyproject.toml                                | 61 +++++++++++++
 setup.cfg                                     | 20 +++++
 setup.py                                      | 37 --------
 13 files changed, 363 insertions(+), 85 deletions(-)
 create mode 100644 .codecov.yml
 create mode 100644 .gitattributes
 create mode 100644 .github/CONTRIBUTING.md
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md
 create mode 100644 CODE_OF_CONDUCT.md
 create mode 100644 MANIFEST.in
 create mode 100644 devtools/README.md
 rename environment.yml => devtools/conda-envs/test_env.yaml (54%)
 create mode 100644 pyproject.toml
 create mode 100644 setup.cfg
 delete mode 100644 setup.py

diff --git a/.codecov.yml b/.codecov.yml
new file mode 100644
index 0000000..a3ed7f4
--- /dev/null
+++ b/.codecov.yml
@@ -0,0 +1,14 @@
+# Codecov configuration to make it a bit less noisy
+coverage:
+  status:
+    patch: false
+    project:
+      default:
+        threshold: 50%
+comment:
+  layout: "header"
+  require_changes: false
+  branches: null
+  behavior: default
+  flags: null
+  paths: null
\ No newline at end of file
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..63058ee
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+quax/_version.py export-subst
\ No newline at end of file
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000..621e242
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,42 @@
+# How to contribute
+
+We welcome contributions from external contributors, and this document
+describes how to merge code changes into Quax.
+
+## Getting Started
+
+* Make sure you have a [GitHub account](https://github.com/signup/free).
+* [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub.
+* On your local machine,
+  [clone](https://help.github.com/articles/cloning-a-repository/) your fork of
+  the repository.
+
+## Making Changes
+
+* Add some really awesome code to your local fork.  It's usually a [good
+  idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/)
+  to make changes on a
+  [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/)
+  with the branch name relating to the feature you are going to add.
+* When you are ready for others to examine and comment on your new feature,
+  navigate to your fork of Quax on GitHub and open a [pull
+  request](https://help.github.com/articles/using-pull-requests/) (PR). Note that
+  after you launch a PR from one of your fork's branches, all
+  subsequent commits to that branch will be added to the open pull request
+  automatically.  Each commit added to the PR will be validated for
+  mergeability, compilation and test suite compliance; the results of these tests
+  will be visible on the PR page.
+* If you're providing a new feature, you must add test cases and documentation.
+* When the code is ready to go, make sure you run the test suite using pytest.
+* When you're ready to be considered for merging, check the "Ready to go"
+  box on the PR page to let the Quax devs know that the changes are complete.
+  The code will not be merged until this box is checked, the continuous
+  integration returns checkmarks,
+  and multiple core developers give "Approved" reviews.
+
+# Additional Resources
+
+* [General GitHub documentation](https://help.github.com/)
+* [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/)
+* [A guide to contributing to software packages](http://www.contribution-guide.org)
+* [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR)
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..c772b96
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,12 @@
+## Description
+Provide a brief description of the PR's purpose here.
+
+## Todos
+Notable points that this PR has either accomplished or will accomplish.
+  - [ ] TODO 1
+
+## Questions
+- [ ] Question1
+
+## Status
+- [ ] Ready to go
\ No newline at end of file
diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 94ba36a..f37a9ad 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -9,37 +9,59 @@ on:
       - master
 
 jobs:
-  build-linux:
-    runs-on: ubuntu-latest
+  test:
+    name: Test on ${{ matrix.os }}, Python ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.9", "3.10", "3.11"]
+
     steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-python@v5
-      with:
-        python-version: '3.10' 
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-    - name: Install dependencies
-      shell: bash -l {0}
-      run: |
-        conda install python=3.10
-        conda config --set solver classic
-        conda install conda-forge::psi4
-        conda install conda-forge::jax
-        conda install conda-forge::jaxlib
-        conda install conda-forge::pybind11
-        conda install conda-forge::eigen
-        conda install conda-forge::hdf5
-        conda install conda-forge::h5py
-        pip install -e .
-    - name: Build integrals
-      shell: bash -l {0}
-      run: |
-        cd $GITHUB_WORKSPACE/quax/integrals
-        make
-        cd $GITHUB_WORKSPACE
-    - name: Test with pytest
-      run: |
-        conda install pytest
-        python -m pytest
+      - uses: actions/checkout@v4
+
+      - name: Additional info about the build
+        shell: bash
+        run: |
+          uname -a
+          df -h
+          ulimit -a
+
+      - name: Create Environment
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          environment-file: devtools/conda-envs/test_env.yaml
+          environment-name: test
+          condarc: | 
+            channels:
+              - conda-forge
+          create-args: >- 
+            python=${{ matrix.python-version }}
+
+      - name: Install package
+        # conda setup requires this special shell
+        shell: bash -l {0}
+        run: |
+          python -m pip install . --no-deps
+          micromamba list
+
+      - name: Build integrals
+        shell: bash -l {0}
+        run: |
+          cd $GITHUB_WORKSPACE/quax/integrals
+          make
+          cd $GITHUB_WORKSPACE
+
+      - name: Run tests
+        # conda setup requires this special shell
+        shell: bash -l {0}
+        run: |
+          pytest -v --cov=quax --cov-report=xml --color=yes tests/
+
+      - name: CodeCov
+        uses: codecov/codecov-action@v1
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
+
diff --git a/.gitignore b/.gitignore
index cf2ea80..22b437b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,3 @@
-localtests/
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -8,11 +6,6 @@ __pycache__/
 # C extensions
 *.so
 
-# Psi4, HdF5, molecule file disk junk
-*.dat
-*.h5
-*.xyz
-
 # Distribution / packaging
 .Python
 env/
@@ -22,6 +15,7 @@ dist/
 downloads/
 eggs/
 .eggs/
+lib/
 lib64/
 parts/
 sdist/
@@ -30,7 +24,6 @@ wheels/
 *.egg-info/
 .installed.cfg
 *.egg
-benchmarks/
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -48,6 +41,7 @@ htmlcov/
 .coverage
 .coverage.*
 .cache
+.pytest_cache
 nosetests.xml
 coverage.xml
 *.cover
@@ -101,11 +95,31 @@ ENV/
 # Rope project settings
 .ropeproject
 
+# Pycharm settings
+.idea
+*.iml
+*.iws
+*.ipr
+
+# Ignore devcontainer
+/.devcontainer
+
+# Ignore VSCode settings
+/.vscode
+
+# Ignore Sublime Text settings
+*.sublime-workspace
+*.sublime-project
+
+# vim swap
+*.swp
+
 # mkdocs documentation
 /site
 
 # mypy
 .mypy_cache/
 
-#misc files
-project_notes.txt
+# profraw files from LLVM? Unclear exactly what triggers this
+# There are reports this comes from LLVM profiling, but also Xcode 9.
+*profraw
\ No newline at end of file
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..1a2531c
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,77 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age,
+body size, disability, ethnicity, gender identity and expression, level of
+experience, nationality, personal appearance, race, religion, or sexual
+identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+Moreover, project maintainers will strive to offer feedback and advice to
+ensure quality and consistency of contributions to the code.  Contributions
+from outside the group of project maintainers are strongly welcomed but the
+final decision as to whether commits are merged into the codebase rests with
+the team of project maintainers.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an
+appointed representative at an online or offline event. Representation of a
+project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at emitchell@uga.edu. The project team will
+review and investigate all complaints, and will respond in a way that it deems
+appropriate to the circumstances. The project team is obligated to maintain
+confidentiality with regard to the reporter of an incident. Further details of
+specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 1.4, available at
+[http://contributor-covenant.org/version/1/4][version]
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..7b62c26
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include CODE_OF_CONDUCT.md
+
+global-exclude *.py[cod] __pycache__ *.so
\ No newline at end of file
diff --git a/devtools/README.md b/devtools/README.md
new file mode 100644
index 0000000..45c983a
--- /dev/null
+++ b/devtools/README.md
@@ -0,0 +1,44 @@
+# Development, testing, and deployment tools
+
+This directory contains a collection of tools for running Continuous Integration (CI) tests, 
+conda installation, and other development tools not directly related to the coding process.
+
+
+## Manifest
+
+### Continuous Integration
+
+You should test your code, but do not feel compelled to use these specific programs.
+
+### Conda Environment:
+
+This directory contains the files to setup the Conda environment for testing purposes
+
+* `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and the provenance of those dependencies
+  * `test_env.yaml`: Simple test environment file with base dependencies, installed from the `conda-forge` channel
+  
+### Additional Scripts:
+
+This directory contains OS agnostic helper scripts which don't fall in any of the previous categories
+* `scripts`
+  * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file with Python Version and Env. Name command-line options
+
+
+## How to contribute changes
+- Clone the repository if you have write access to the main repo, fork the repository if you are a collaborator.
+- Make a new branch with `git checkout -b {your branch name}`
+- Make changes and test your code
+- Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`)
+- Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}`
+  * Note that `origin` is the default name assigned to the remote, yours may be different
+- Make a PR on GitHub with your changes
+- We'll review the changes and get your code into the repo after lively discussion!
+
+
+## Checklist for updates
+- [ ] Make sure there is an/are issue(s) opened for your specific update
+- [ ] Create the PR, referencing the issue
+- [ ] Debug the PR as needed until tests pass
+- [ ] Tag the final, debugged version 
+   *  `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags`
+- [ ] Get the PR merged in
diff --git a/environment.yml b/devtools/conda-envs/test_env.yaml
similarity index 54%
rename from environment.yml
rename to devtools/conda-envs/test_env.yaml
index 1888e45..5c27c5b 100644
--- a/environment.yml
+++ b/devtools/conda-envs/test_env.yaml
@@ -1,14 +1,19 @@
-name: quax
+name: test-quax
 channels:
   - conda-forge
-  - nodefaults
 dependencies:
-  - psi4
-  - jax
-  - jaxlib
+  # Libint dependencies
   - libint
   - pybind11
   - eigen
+  - boost
   - hdf5
+  # Quax dependencies
+  - psi4
+  - jax
+  - jaxlib
   - h5py
+  # Testing
   - pytest
+  - pytest-cov
+  - codecov
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..837e7c9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,61 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+# Self-descriptive entries which should always be present
+# https://packaging.python.org/en/latest/specifications/declaring-project-metadata/
+[project]
+name = "quax"
+description = "Arbitrary order derivatives of electronic structure computations."
+dynamic = ["version"]
+readme = "README.md"
+authors = [
+  { name = "Adam Abbott", email = "adabbott@uga.edu" },
+  { name = "Erica Mitchell", email = "emitchell@uga.edu" }
+]
+license = { text = "BSD-3C" }
+# See https://pypi.org/classifiers/
+classifiers = [
+  "License :: OSI Approved :: BSD License",
+  "Programming Language :: Python :: 3",
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Science/Research"
+]
+requires-python = ">=3.9"
+# Declare any run-time dependencies that should be installed with the package.
+dependencies = [
+  "importlib-resources;python_version=<'3.12'",
+  "numpy>=1.23,<2.0",
+  "jax>=0.4.19",
+  "jaxlib>=0.4.19",
+  "h5py>=2.8.0",
+  "scipy>=1.9"
+]
+
+# Update the urls once the hosting is set up.
+[project.urls]
+"Source" = "https://github.com/CCQC/Quax/"
+#"Documentation" = "Quax.readthedocs.io/"
+
+[project.optional-dependencies]
+test = [
+  "pytest>=6.1.2",
+  "pytest-cov"
+]
+
+[tool.setuptools]
+zip-safe = false
+# Let setuptools discover the package in the current directory,
+# but be explicit about non-Python files.
+# See also:
+#   https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html#setuptools-specific-configuration
+include-package-data = false
+
+[tool.setuptools.packages.find]
+namespaces = false
+where = ["."]
+
+[tool.setuptools-git-versioning]
+enabled = true
+dev_template = "{tag}.{ccount}+git.{sha}"
+dirty_template = "{tag}.{ccount}+git.{sha}.dirty"
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..fb0a539
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,20 @@
+# Helper file to handle all configs
+
+[coverage:run]
+# .coveragerc to control coverage.py and pytest-cov
+omit =
+    # Omit the tests
+    */tests/*
+
+[yapf]
+# YAPF, in .style.yapf files this shows up as "[style]" header
+COLUMN_LIMIT = 119
+INDENT_WIDTH = 4
+USE_TABS = False
+
+[flake8]
+# Flake8, PyFlakes, etc
+max-line-length = 119
+
+[aliases]
+test = pytest
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 2ebb2c0..0000000
--- a/setup.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import setuptools
-
-if __name__ == "__main__":
-    setuptools.setup(
-        name='quax',
-        version="0.2.0",
-        description='Arbitrary order derivatives of electronic structure computations.',
-        author='Adam Abbott, Erica Mitchell',
-        author_email='adabbott@uga.edu, emitchell@uga.edu',
-        url="none",
-        license='BSD-3C',
-        packages=setuptools.find_packages(where="quax"),
-        package_dir={"": "quax"},
-        install_requires=[
-            'numpy>=1.23,<2.0',
-            'jax>=0.4.19',
-            'jaxlib>=0.4.19',
-            'h5py>=2.8.0',
-            'scipy>=1.9'
-        ],
-        extras_require={
-            'tests': [
-                'pytest-cov',
-            ],
-        },
-
-        tests_require=[
-            'pytest-cov',
-        ],
-
-        classifiers=[
-            'Development Status :: 4 - Beta',
-            'Intended Audience :: Science/Research',
-            'Programming Language :: Python :: 3',
-        ],
-        zip_safe=False
-    )

From f8b55b1504075962240daadc561e9963e4b070d4 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 18:01:58 +0000
Subject: [PATCH 87/91] Skip test_dipoles.py, include .so in installation

---
 MANIFEST.in           |  3 +++
 pyproject.toml        | 10 ++++------
 tests/test_dipoles.py |  3 +++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 7b62c26..cbb9f6e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,6 @@
 include CODE_OF_CONDUCT.md
+include MANIFEST.in
+include LICENSE
 
+graft quax
 global-exclude *.py[cod] __pycache__ *.so
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 837e7c9..4e29bc1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
 requires-python = ">=3.9"
 # Declare any run-time dependencies that should be installed with the package.
 dependencies = [
-  "importlib-resources;python_version=<'3.12'",
+  "importlib-resources;python_version>'3.8'",
   "numpy>=1.23,<2.0",
   "jax>=0.4.19",
   "jaxlib>=0.4.19",
@@ -45,16 +45,14 @@ test = [
 
 [tool.setuptools]
 zip-safe = false
-# Let setuptools discover the package in the current directory,
-# but be explicit about non-Python files.
-# See also:
-#   https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html#setuptools-specific-configuration
-include-package-data = false
 
 [tool.setuptools.packages.find]
 namespaces = false
 where = ["."]
 
+[tool.setuptools.package-data]
+quax = ["integrals/*.so"]
+
 [tool.setuptools-git-versioning]
 enabled = true
 dev_template = "{tag}.{ccount}+git.{sha}"
diff --git a/tests/test_dipoles.py b/tests/test_dipoles.py
index f084bae..744774e 100644
--- a/tests/test_dipoles.py
+++ b/tests/test_dipoles.py
@@ -6,6 +6,9 @@
 import pytest
 import numpy as np
 
+# Comment out if you have an installation of Libint with Cartesian multipole derivatives
+pytestmark = pytest.mark.skip("Requires Libint Cartesian multipole derivatives")  
+
 molecule = psi4.geometry("""
 0 1
 O   -0.000007070942     0.125146536460     0.000000000000

From 0403d679d008a6435395106f870dc42dcd1b1565 Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 18:30:08 +0000
Subject: [PATCH 88/91] Update README, generic makefile

---
 README.md               | 20 ++++++++++++++++++--
 quax/integrals/makefile | 17 +++++++----------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 134d57d..b7e2a11 100644
--- a/README.md
+++ b/README.md
@@ -128,8 +128,24 @@ To use Quax, only a few dependencies are needed. We recommend using a clean Anac
 ```
 conda create -n quax python=3.10
 conda activate quax
-conda install psi4 python=3.10 -c conda-forge/label/libint_dev -c conda-forge
-python setup.py install
+```
+
+Then install the dependencies into your new environment; all of them can be installed alongside Psi4:
+```
+conda install psi4 python=3.10 -c conda-forge
+```
+
+The Libint interface must be built before installing:
+```
+cd quax/integrals/
+make
+cd ../../
+```
+More details can be found below if a custom Libint build is wanted.
+
+Finally, install Quax!
+```
+python -m pip install .
 ```
 
 ### Building the Libint Interface
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index 5e6452f..33ad7d5 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -1,22 +1,19 @@
 # NOTE: These paths below need to be edited such that they point to a set of 
-# Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp, the rest of the Libint2 headers, and the library location of libint2.a,
+# Eigen headers, Python headers, Pybind11 headers, Libint API headers libint2.h libint2.hpp,
+# the rest of the Libint2 headers, and the library location of libint2.a
 CC      := g++
 # Options passed to compiler, add "-fopenmp" if intending to use OpenMP
 CFLAGS  := -O3 -fPIC -fopenmp
 # Libint prefix location (where /include, /include/libint2, /lib, /share are located) 
-LIBINT_PREFIX := /usr/share/miniconda
+LIBINT_PREFIX := $(shell python3-config --prefix)
 # Conda prefix location, it is suggested to use conda to install nearly all dependencies
-CONDA_PREFIX := /usr/share/miniconda
+CONDA_PREFIX := $(shell python3-config --prefix)
 
 I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := $(CONDA_PREFIX)/include/eigen3
-# Python headers location 
-I4 := $(CONDA_PREFIX)/include/python3.10
-# Pybind11 headers location 
-I5 := $(CONDA_PREFIX)/lib/python3.10/site-packages/pybind11/include
+I3 := /usr/include/eigen3
 # HDF5 headers, static and shared libraries 
 I6 := $(CONDA_PREFIX)/include
 L2 := $(CONDA_PREFIX)/lib
@@ -25,7 +22,7 @@ RPATH := -Wl,-rpath,"$(CONDA_PREFIX)/lib"
 
 # This 'TARGETS' suffix should be set to whatever is returned by the command `python3-config --extension-suffix` entered on command line.
 # and it should match the same python version referenced in the above include path for I4 := (3.7 in this case)
-TARGETS := libint_interface.cpython-310-x86_64-linux-gnu.so
+TARGETS := libint_interface$(shell python3-config --extension-suffix)
 OBJ     := libint_interface.o
 
 # Rest is boilerplate. Do not edit unless you know what you're doing.
@@ -37,7 +34,7 @@ clean:
 	rm -f $(OBJ)
 
 $(OBJ): %.o : %.cc $(DEPS)
-	$(CC) -c $< -o $@ $(CFLAGS) -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
+	$(CC) -c $< -o $@ $(CFLAGS) -I $(I1) -I $(I2) -I $(I3) $(shell python3 -m pybind11 --includes) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 $(TARGETS): $(OBJ)
 	$(CC) $^ -o $@ $(CFLAGS) -shared -I $(I1) -I $(I2) -I $(I3) -I $(I4) -I $(I5) -I $(I6) -lint2 -L $(L1) -lhdf5 -lhdf5_cpp -L $(L2) $(RPATH)
 

From d46cc13517a2c8470f45307d41b2d1ab61cb90dc Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 18:36:24 +0000
Subject: [PATCH 89/91] Fix CI ordering, missed change in makefile

---
 .github/workflows/continuous_integration.yml | 14 +++++++-------
 quax/integrals/makefile                      |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index f37a9ad..67ea1a3 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -38,13 +38,6 @@ jobs:
           create-args: >- 
             python=${{ matrix.python-version }}
 
-      - name: Install package
-        # conda setup requires this special shell
-        shell: bash -l {0}
-        run: |
-          python -m pip install . --no-deps
-          micromamba list
-
       - name: Build integrals
         shell: bash -l {0}
         run: |
@@ -52,6 +45,13 @@ jobs:
           make
           cd $GITHUB_WORKSPACE
 
+      - name: Install package
+        # conda setup requires this special shell
+        shell: bash -l {0}
+        run: |
+          python -m pip install .
+          micromamba list
+
       - name: Run tests
         # conda setup requires this special shell
         shell: bash -l {0}
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index 33ad7d5..ce46153 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -13,7 +13,7 @@ I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := /usr/include/eigen3
+I3 := $(CONDA_PREFIX)/eigen3
 # HDF5 headers, static and shared libraries 
 I6 := $(CONDA_PREFIX)/include
 L2 := $(CONDA_PREFIX)/lib

From 65a99eacf1ab03c25b7632528229738e914e4e8c Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 18:52:26 +0000
Subject: [PATCH 90/91] Fix versioning

---
 pyproject.toml          | 2 +-
 quax/integrals/makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e29bc1..5be9c61 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.0"]
+requires = ["setuptools>=61.0", "setuptools-git-versioning"]
 build-backend = "setuptools.build_meta"
 
 # Self-descriptive entries which should always be present
diff --git a/quax/integrals/makefile b/quax/integrals/makefile
index ce46153..8b8bf9f 100644
--- a/quax/integrals/makefile
+++ b/quax/integrals/makefile
@@ -13,7 +13,7 @@ I1 := $(LIBINT_PREFIX)/include
 I2 := $(LIBINT_PREFIX)/include/libint2
 L1 := $(LIBINT_PREFIX)/lib
 # Eigen headers location 
-I3 := $(CONDA_PREFIX)/eigen3
+I3 := $(CONDA_PREFIX)/include/eigen3
 # HDF5 headers, static and shared libraries 
 I6 := $(CONDA_PREFIX)/include
 L2 := $(CONDA_PREFIX)/lib

From 2846db89327bd8dffd0cb1c2b90616c07e904a2d Mon Sep 17 00:00:00 2001
From: Erica Mitchell <ericamitch5@gmail.com>
Date: Wed, 21 Aug 2024 19:01:07 +0000
Subject: [PATCH 91/91] CI schedule, update Python versions

---
 .github/workflows/continuous_integration.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 67ea1a3..e130a5a 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -7,6 +7,12 @@ on:
   pull_request:
     branches:
       - master
+  schedule:
+    # Weekly tests run on the default branch:
+    #   Scheduled workflows run on the latest commit on the default or base branch.
+    #   (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
+    - cron: "0 0 * * 0"
+    #   Scheduled workflows are automatically disabled when no repository activity has occurred in 60 days.
 
 jobs:
   test:
@@ -15,7 +21,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4