diff --git a/src/madness/bspline/interp.cc b/src/madness/bspline/interp.cc
new file mode 100644
index 00000000000..39af6e640fb
--- /dev/null
+++ b/src/madness/bspline/interp.cc
@@ -0,0 +1,211 @@
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <cassert>
+
+template <typename T>
+std::ostream& operator<<(std::ostream& s, const std::vector<T>& v) {
+    s << "[ ";
+    for (const auto& x : v) s << x << " ";
+    s << "]";
+    return s;
+}
+
+// Barycentric interpolation class using N uniformly spaced intervals with m uniformly spaced points within each interval (including endpoints)
+template <typename T>
+class Barycentric {
+    size_t N; // Number of larger intervals
+    size_t m; // Number of points in each interval
+    T a;      // Left endpoint of the interval
+    T b;      // Right endpoint of the interval
+    T H;      // Larger interval size
+    T h;      // Smaller interval size
+
+    std::vector<T> y; // Vector of function values
+    std::vector<T> w; // Vector of m Barycentric weights
+
+    static size_t factorial(size_t n) {
+        size_t f = 1;
+        for (size_t i=1; i<=n; i++) f *= i;
+        return f;
+    }
+
+    static std::vector<T> weights(const size_t m) {
+        std::vector<T> w(m);
+        size_t n = m-1;
+        size_t nfac = factorial(n);
+        T sign = 1;
+        for (size_t j=0; j<=n; j++) {
+            w[j] = sign*nfac/(factorial(j)*factorial(n-j));
+            sign = -sign;
+        }
+        return w;
+    }
+
+ public:
+
+    template <typename funcT>
+    Barycentric(size_t N, size_t m, T a, T b, funcT f)
+        : N(N)
+        , m(m)
+        , a(a)
+        , b(b)
+        , H((b-a)/N)
+        , h(H/(m-1))
+        , y(N*(m-1)+1)
+        , w(weights(m))
+    {
+        assert(m>=2);
+        assert(N>=1);
+        for (size_t i=0; i<y.size(); i++) y[i] = f(a + i*h);
+    }
+
+    T operator()(T x) const {
+        assert(x>=a && x<=b);
+        size_t i = std::min(size_t((x-a)/H), N-1); // which large interval
+        size_t off = i*(m-1);                      // index of its first sample
+        T x0 = a + i*H;
+        T num = 0, den = 0;
+        for (size_t j=0; j<m; j++) {
+            T d = x - (x0 + j*h);
+            if (d == T(0)) return y[off+j]; // exactly on a sample point
+            T t = w[j]/d;
+            num += t*y[off+j];
+            den += t;
+        }
+        return num/den;
+    }
+
+    template <typename funcT>
+    T maxerr(funcT f, size_t oversample=5) {
+        T testh = h/oversample;
+        T err = 0;
+        T x = a;
+        while (x < b) {
+            err = std::max(std::abs(f(x) - (*this)(x)), err);
+            x += testh;
+        }
+        return err;
+    }
+};
+
+// Barycentric interpolation class using N uniformly spaced intervals with m Chebyshev points in each interval
+template <typename T>
+class BarycentricChebyshev {
+    size_t N; // Number of larger intervals
+    size_t m; // Number of points in each interval
+    T a;      // Left endpoint of the interval
+    T b;      // Right endpoint of the interval
+    T H;      // Larger interval size
+
+    std::vector<T> y; // Vector of m*N function values
+    std::vector<T> p; // Vector of m sampling points in each interval
+    std::vector<T> w; // Vector of m Barycentric weights
+
+    std::vector<T> points(const size_t m) {
+        size_t n = m-1;
+        std::vector<T> p(m);
+        for (size_t j=0; j<=n; j++) {
+            p[j] = 0.5*H*(1-std::cos(M_PI*(j+0.5)/m));
+        }
+        return p;
+    }
+
+    std::vector<T> weights(const size_t m) {
+        std::vector<T> w(m);
+        size_t n = m-1;
+        T sign = 1;
+        for (size_t j=0; j<=n; j++) {
+            w[j] = sign*std::sin(M_PI*(j+0.5)/m);
+            sign = -sign;
+        }
+        return w;
+    }
+
+public:
+    template <typename funcT>
+    BarycentricChebyshev(size_t N, size_t m, T a, T b, funcT f)
+        : N(N)
+        , m(m)
+        , a(a)
+        , b(b)
+        , H((b-a)/N)
+        , y(N*m)
+        , p(points(m))
+        , w(weights(m))
+    {
+        assert(m>=2);
+        assert(N>=1);
+        for (size_t i=0; i<N; i++)
+            for (size_t j=0; j<m; j++)
+                y[i*m+j] = f(a + i*H + p[j]);
+    }
+
+    T operator()(T x) const {
+        assert(x>=a && x<=b);
+        size_t i = std::min(size_t((x-a)/H), N-1); // which large interval
+        T x0 = a + i*H;
+        T num = 0, den = 0;
+        for (size_t j=0; j<m; j++) {
+            T d = x - (x0 + p[j]);
+            if (d == T(0)) return y[i*m+j]; // exactly on a sample point
+            T t = w[j]/d;
+            num += t*y[i*m+j];
+            den += t;
+        }
+        return num/den;
+    }
+
+    template <typename funcT>
+    T maxerr(funcT f, size_t oversample=5) {
+        T testh = H/(m*oversample);
+        T err = 0;
+        T x = a;
+        while (x < b) {
+            err = std::max(std::abs(f(x) - (*this)(x)), err);
+            x += testh;
+        }
+        return err;
+    }
+};
+
+int main() {
+    auto f = [](double x) {return std::sin(10.0*x);}; // smooth test function on [0,1]
+    for (size_t N=1; N<=64; N*=4) {
+        for (size_t m=2; m<=12; m+=2) {
+            Barycentric<double> b(N, m, 0, 1, f);
+            BarycentricChebyshev<double> c(N, m, 0, 1, f);
+            std::cout << "N = " << N << " m = " << m << " maxerr = " << b.maxerr(f) << " " << c.maxerr(f) << std::endl;
+        }
+    }
+    return 0;
+}
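+
+// Reference for the evaluation formula used above: this is standard second-form
+// (true) barycentric interpolation (Berrut & Trefethen, "Barycentric Lagrange
+// Interpolation", SIAM Review 46 (2004)),
+//     p(x) = [sum_j (w_j/(x-x_j)) y_j] / [sum_j w_j/(x-x_j)] ,
+// in which any factor common to all the weights cancels.  For m uniformly spaced
+// points the weights are w_j = (-1)^j C(m-1,j), e.g., for m=4 they are
+//     1 -3 3 -1
+// and for the m Chebyshev points of the first kind used above they are
+// w_j = (-1)^j sin(pi*(j+1/2)/m), in both cases up to an irrelevant common scale.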
diff --git a/src/madness/bspline/madbessel.cc b/src/madness/bspline/madbessel.cc
index c62d94fd29b..d4e8f91d45e 100644
--- a/src/madness/bspline/madbessel.cc
+++ b/src/madness/bspline/madbessel.cc
@@ -1,14 +1,19 @@
 //#pragma GCC optimize "-fno-associative-math"
+#include <algorithm>
 #include <array>
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <vector>
 #include <qd/dd_real.h>
 #include <qd/qd_real.h>
+#include <tuple>
+#include <typeinfo>
+
 template <typename T>
 class Matrix {
     size_t n, m;
@@ -25,8 +30,8 @@ class Matrix {
 
 size_t ggi = 9999999;
-std::vector<double> ggr;
-Matrix<double> ggj, ggh;
+std::vector<qd_real> ggr;
+Matrix<qd_real> ggj, ggh;
 
 // print a vector
 template <typename T>
@@ -94,7 +99,7 @@ namespace std {
 template <typename T>
 struct factorial_cache {
     static constexpr size_t N = 171; // more than 171 will overflow exponent
-    static std::array<T,N+1> f; // +1 for 0
+    static inline std::array<T,N+1> f = {}; // +1 for 0
     factorial_cache() {
         f[0] = 1;
         for (int i=1; i<=int(N); i++) f[i] = f[i-1]*i; // int since T might be dd_real/qd_real
@@ -104,7 +109,7 @@ struct factorial_cache {
 template <typename T>
 struct double_factorial_cache {
     static constexpr size_t N = 300; // more than 300 will overflow exponent
-    static std::array<T,N+1> f; // +1 for 0
+    static inline std::array<T,N+1> f = {}; // +1 for 0
     double_factorial_cache() {
         f[0] = 1;
         f[1] = 1;
@@ -112,8 +117,9 @@ struct double_factorial_cache {
     }
 };
 
-template <typename T> std::array<T,factorial_cache<T>::N+1> factorial_cache<T>::f = {};
-template <typename T> std::array<T,double_factorial_cache<T>::N+1> double_factorial_cache<T>::f = {};
+// not needed since c++17 inline variables
+//template <typename T> std::array<T,factorial_cache<T>::N+1> factorial_cache<T>::f = {};
+//template <typename T> std::array<T,double_factorial_cache<T>::N+1> double_factorial_cache<T>::f = {};
 
 // If you might overflow size_t use floating point number for T
 template <typename T>
@@ -530,7 +536,7 @@ T RsmallrX(size_t l, T r) {
 }
 
 template <typename T>
-std::vector<T> JsphericalscaledVec(size_t maxl, T r) {
+std::vector<T> JsphericalscaledVecGood(size_t maxl, T r) {
     assert(maxl == 32);
     if (r == T(0.0)) return std::vector<T>(maxl+1,T(0.0));
     const T eps = std::numeric_limits<T>::epsilon();
@@ -689,9 +695,537 @@
     return j;
 }
 
+template <typename T>
+std::vector<T> JsphericalscaledVecXX(size_t maxl, T r) {
+    assert(maxl == 32);
+    if (r == T(0.0)) return std::vector<T>(maxl+1,T(0.0));
+    const T eps = std::numeric_limits<T>::epsilon();
+    const T min = std::numeric_limits<T>::min();
+
+    const T one = T(1.0);
+    const T two = T(2.0);
+    const T half = T(0.5);
+
+    const T rr = one/r;
+    T R;
+
+    //bool doprint = (ggi != 9999999) && (std::abs(r - 0.00191116) < 0.00001);
+    const bool doprint = false;
+    if (doprint) {
+        std::cout << "printing " << r << " " << (ggj(33,ggi)>min) << " " << min << std::endl;
+    }
+
+    std::vector<T> j(maxl+1);
+
+    T rcut = 20.0; // So that exp(-2*rcut) is much less than epsilon including gradual underflow
+    if constexpr (std::is_same<T,float>::value) rcut = 10.0;
+    else if constexpr (std::is_same<T,double>::value) rcut = 20.0;
+    else if constexpr (std::is_same<T,dd_real>::value) rcut = 40.0;
+    else if constexpr (std::is_same<T,qd_real>::value) rcut = 80.0;
+
+    auto RELERR = [&](const T& a, const T& b) {return (std::abs(a-b)/std::abs(b))/eps;};
+
+    if (r < std::max(rcut,T(12.5*maxl))) {
+        if (r < 0.5) { // was 2.5
+            R = Rsmallr(48,r);
+        }
+        else {
+            R = b33(r); // Very accurate guess for R_38, slight error at small r but going from 38 to 33 will fix it // was 38 now 48
+        }
+        size_t n = 48;
+        while (--n>maxl) {
+            R = T(1.0)/(R + T(2*n+1.0)*rr); // Compute R_n from R_{n+1}
+        }
+
+        if (doprint) {
+            T Rtest = ggj(33,ggi)/ggj(32,ggi);
+            T relerr = RELERR(R,Rtest);
+            if (relerr > 0) {
+                std::cout << 33 << " Rtest " << Rtest << " " << R << " " << ggi << " " << r << " " << ggr[ggi] << " " << ggj(33,ggi) << " " << ggj(32,ggi) << " " << relerr << std::endl;
+            }
+        }
+
+        T Rsave = R;
+
+        T j1 = min * std::max(one, Rsave*r/T(2*maxl+1.0)); // was Rsave ... now trying to avoid underflow in s below
+        T j0 = j1/Rsave; // was 1
+        j[maxl] = j0;
+
+        if (doprint) {
+            T R = Rsave;
+            T Rtest = ggj(maxl+1,ggi)/ggj(maxl,ggi);
+            T relerr = RELERR(R,Rtest);
+            if (relerr > 0) {
+                std::cout << maxl+1 << " Rtest " << Rtest << " " << R << " " << ggi << " " << r << " " << ggr[ggi] << " " << ggj(maxl+1,ggi) << " " << ggj(maxl,ggi) << " " << relerr << std::endl;
+            }
+        }
+
+        for (int l=maxl; l>0; l--) {
+            j[l-1] = j1 + (T(2*l+1.0)*rr)*j0;
+
+            //std::cout << "NEW " << l << " " << j1 << " " << j0 << " " << j[l-1] << std::endl;
+            j1 = j0;
+            j0 = j[l-1];
+
+            if (doprint) {
+                T R = j1/j0;
+                T Rtest = ggj(l,ggi)/ggj(l-1,ggi);
+                T relerr = RELERR(R,Rtest);
+                if (relerr > 0) {
+                    std::cout << l << " Rtest " << Rtest << " " << R << " " << ggi << " " << r << " " << ggr[ggi] << " " << ggj(l,ggi) << " " << ggj(l-1,ggi) << " " << relerr << std::endl;
+                }
+            }
+        }
+        //std::cout << "NEW " << j << std::endl;
+
+        if (doprint) {
+            T R = j[32]/j[0];
+            T Rtest = ggj(32,ggi)/ggj(0,ggi);
+            T relerr = RELERR(R,Rtest);
+            if (relerr > 0) {
+                std::cout << 99 << " Rtestall " << Rtest << " " << R << " " << ggi << " " << r << " " << ggr[ggi] << " " << ggj(0,ggi) << " " << ggj(32,ggi) << " " << relerr << std::endl;
+            }
+        }
+
+        j0 = -myexpm1(-two*r)*half*rr; // for small r use expm1 to avoid cancellation
+
+        // if (ggi != 9999999) {
+        //     T j0test = ggj(0,ggi);
+        //     T relerr = RELERR(j0,j0test);
+        //     if (relerr > 0) {
+        //         std::cout << 0 << " Rtest00 " << ggi << " " << r << " " << ggr[ggi] << " " << j0 << " " << j0test << " " << relerr << std::endl;
+        //     }
+        // }
+
+        T s = j0/j[0];
+        if (doprint) std::cout << "s " << s << std::endl;
+
+        if (s < min) {
+            std::cout << "warning " << s << " " << min << std::endl;
+        }
+        j[0] = j0;
+
+        double reps = to_double(dd_real(r) - dd_real(1.0)/dd_real(rr));
+
+        for (size_t l=1; l<=maxl; l++) {
+            j[l] *= s;
+
+            j[l] += (j[l-1]-(one+T(l+1)*rr)*j[l])*reps; // correction due to inexact computing of 1/r
+
+            if (doprint) {
+                T R = j[l]/j[l-1];
+                T Rtest = ggj(l,ggi)/ggj(l-1,ggi);
+                T relerr = RELERR(R,Rtest);
+                if (relerr > 0) {
+                    std::cout << l << " RtestZ " << Rtest << " " << R << " " << ggi << " " << r << " " << ggr[ggi] << " " << ggj(l,ggi) << " " << ggj(l-1,ggi) << " " << relerr << std::endl;
+                }
+            }
+        }
+        //std::cout << s << std::endl;
+        //std::cout << "NEW " << j << std::endl;
+        //std::exit(0);
+    }
+    else {
+        // const T expm2r = std::exp(-two*r);
+        // j[0] = (one-expm2r)*half*rr;
+        // j[1] = ((one+expm2r)*half - j[0])*rr;
+        j[0] = half*rr;
+        j[1] = (half - j[0])*rr;
+        for (size_t l=1; l<maxl; l++) j[l+1] = j[l-1] - (T(2*l+1.0)*rr)*j[l];
+    }
+    return j;
+}
+
+template <typename T>
+std::vector<T> JsphericalscaledVecDumbBAD(size_t maxl, T r) {
+    const T zero = 0;
+    std::vector<T> j(maxl+1,zero);
+    if (r == zero) return j;
+
+    const T eps = std::numeric_limits<T>::epsilon();
+    const T min = std::numeric_limits<T>::min();
+    const T one = 1;
+    const T two = 2;
+    const T half = T(0.5);
+    const T rr = one/r;
+    T R;
+
+    bool doprint = false; //std::abs(r-T(9.536743e-07)) < 1e-11;
+    if (doprint) std::cout << "r " << r << " eps " << eps << std::endl;
+
+    T rcut = 20.0; // So that exp(-2*rcut) is much less than epsilon including gradual underflow
+    if constexpr (std::is_same<T,float>::value) rcut = 10.0;
+    else if constexpr (std::is_same<T,double>::value) rcut = 20.0;
+    else if constexpr (std::is_same<T,dd_real>::value) rcut = 40.0;
+    else if constexpr (std::is_same<T,qd_real>::value) rcut = 80.0;
+
+    if (r < std::max(rcut,T(12.5*maxl))) {
+        size_t n;
+        if constexpr (std::is_same<T,float>::value) n = maxl + 20;
+        else if constexpr (std::is_same<T,double>::value) n = maxl + 80;
+        else if constexpr (std::is_same<T,dd_real>::value) n = maxl + 130;
+        else if constexpr (std::is_same<T,qd_real>::value) n = maxl + 200;
+
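+        // The starting guess below assumes the ratio R_m = j_m/j_{m-1} has
+        // converged for m near the start order n, i.e. that R is a fixed point of
+        // the downward recursion x = 1/(x + s) with s = (2n+1)/r held constant.
+        // That gives x^2 + s*x - 1 = 0, whose positive root is (sqrt(s^2+4)-s)/2.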
+        {
+            double s = (2*n+1)/to_double(r);
+            R = T((std::sqrt(s*s + 4) - s)*0.5);
+        }
+
+        // while (--n>maxl) {
+        //     R = one/(R + (2*n+one)*rr); // Compute R_n from R_{n+1}
+        // }
+
+        // Reciprocal is slow so write back again in terms of function values instead of ratios
+        {
+            T j1 = min;
+            T j0 = j1/R;
+            while (--n>maxl) {
+                T j = j1 + ((2*n+one)*rr)*j0;
+                j0 = j1;
+                j1 = j;
+                if (j1 > one) {
+                    j0 = std::ldexp(j0,-30);
+                    j1 = std::ldexp(j1,-30);
+                }
+            }
+            R = j1 / j0;
+            std::cout << "TTT " << maxl << " " << r << " " << R << std::endl;
+        }
+
+        {
+            double s = (2*n+1)/to_double(r);
+            R = T((std::sqrt(s*s + 4) - s)*0.5);
+        }
+
+        while (--n>maxl) {
+            R = one/(R + (2*n+one)*rr); // Compute R_n from R_{n+1}
+        }
+        std::cout << "YYY " << maxl << " " << r << " " << R << std::endl;
+
+        // This helps avoid underflow but loses precision when computing j1/R
+        //T j1 = min * std::max(one, R*r/T(2*maxl+one)); //was R ... now trying to avoid gradual underflow in s below
+        //T j0 = j1/R; // was 1
+
+        T j1 = R;
+        T j0 = one;
+        if constexpr (std::is_same<T,dd_real>::value || std::is_same<T,qd_real>::value) {
+            double small = std::exp2(-128.0);
+            j1 = mul_pwr2(R,small); // exact power of 2 so easy and exact to compute and apply
+            j0 = small;
+            // //double small = std::exp2(-128.0);
+            // j1 = std::ldexp(j1,-128); //mul_pwr2(R,small); // exact power of 2 so easy and exact to compute and apply
+            // j0 = std::exp2(-128); //small;
+        }
+
+        j[maxl] = j0;
+
+        for (int l=maxl; l>0; l--) {
+            j[l-1] = j1 + ((2*l+one)*rr)*j0;
+            j1 = j0;
+            j0 = j[l-1];
+        }
+
+        j0 = -std::expm1(-two*r)*half*rr; // for small r use expm1 to avoid cancellation
+
+        T s = j0/j[0];
+
+        if (s < min) {
+            std::cout << "warning: s has underflowed " << s << " " << min << std::endl;
+        }
+        j[0] = j0;
+
+        // Newton correction seems to help a lot for double, a little for float, but not at all for dd_real.
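+        // The correction applied below rests on the derivative identity for these
+        // scaled functions,
+        //     d/dr [ exp(-r) i_l(r) ] = exp(-r) i_{l-1}(r) - (1 + (l+1)/r) exp(-r) i_l(r),
+        // and takes one first-order (Newton/Taylor) step from the radius actually
+        // represented by 1/rr back to r; reps = r - 1/rr is that representation error.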
+        T reps = 0;
+        if constexpr (std::is_same<T,float>::value) reps = float(double(r) - double(1.0)/double(rr));
+        else if constexpr (std::is_same<T,double>::value) reps = to_double(dd_real(r) - dd_real(1.0)/dd_real(rr));
+        else if constexpr (std::is_same<T,dd_real>::value) reps = to_dd_real(qd_real(r) - qd_real(1.0)/qd_real(rr));
+
+        for (size_t l=1; l<=maxl; l++) {
+            j[l] *= s;
+            if constexpr (!std::is_same<T,qd_real>::value) {
+                j[l] += (j[l-1]-(one+(l+one)*rr)*j[l])*reps; // Newton correction due to inexact computing of 1/r
+            }
+
+            // auto RELERR = [&](const T& a, const T& b) {return (std::abs(a-b)/std::abs(b))/eps;};
+
+            // if (doprint) {
+            //     T R = j[l]/j[l-1];
+            //     T Rtest = ggj(l,ggi)/ggj(l-1,ggi);
+            //     T Rrelerr = RELERR(R,Rtest);
+            //     T jrelerr = RELERR(j[l],ggj(l,ggi));
+            //     if (Rrelerr > 0) {
+            //         std::cout << l << " RtestZ " << Rtest << " " << R << " " << Rrelerr << " " << j[l] << " " << ggj(l,ggi) << " " << jrelerr << std::endl;
+            //     }
+            // }
+        }
+    }
+    else {
+        j[0] = half*rr;
+        j[1] = (half - j[0])*rr;
+        for (size_t l=1; l<maxl; l++) j[l+1] = j[l-1] - ((2*l+one)*rr)*j[l];
+    }
+    return j;
+}
+
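+// Overview of the scheme shared by these variants: the recurrence together with
+// the closed form used for j[0] identify the values as the exponentially scaled
+// modified spherical Bessel functions exp(-r)*i_l(r), which satisfy
+//     j_{l-1} = j_{l+1} + ((2l+1)/r)*j_l .
+// Downward (Miller-style) recursion from a start order n >> maxl is stable, and
+// the whole sequence is then normalized using j_0(r) = (1-exp(-2r))/(2r), which
+// is exactly what the scale factor s computes.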
+template <typename T>
+std::vector<T> JsphericalscaledVecDumb(size_t maxl, T r) {
+    const T zero = 0;
+    std::vector<T> j(maxl+1,zero);
+    if (r == zero) return j;
+
+    const T eps = std::numeric_limits<T>::epsilon();
+    const T min = std::numeric_limits<T>::min();
+    const T one = 1;
+    const T two = 2;
+    const T half = T(0.5);
+    const T rr = one/r;
+    T R;
+
+    bool doprint = false; //std::abs(r-T(9.536743e-07)) < 1e-11;
+    if (doprint) std::cout << "r " << r << " eps " << eps << std::endl;
+
+    T rcut = 20.0; // So that exp(-2*rcut) is much less than epsilon including gradual underflow
+    if constexpr (std::is_same<T,float>::value) rcut = 10.0;
+    else if constexpr (std::is_same<T,double>::value) rcut = 20.0;
+    else if constexpr (std::is_same<T,dd_real>::value) rcut = 40.0;
+    else if constexpr (std::is_same<T,qd_real>::value) rcut = 80.0;
+
+    if (r < std::max(rcut,T(12.5*maxl))) {
+        size_t n;
+        if constexpr (std::is_same<T,float>::value) n = maxl + 22;
+        else if constexpr (std::is_same<T,double>::value) n = maxl + 80;
+        else if constexpr (std::is_same<T,dd_real>::value) n = maxl + 130;
+        else if constexpr (std::is_same<T,qd_real>::value) n = maxl + 204;
+
+        // Downward recursion to get R_n for n > maxl using ratios which is numerically stable with no overflow but slow due to reciprocals
+        // {
+        //     T s = (2*n+one)/r;
+        //     R = (std::sqrt(s*s + T(4)) - s)*half; // note fix below for small r cancellation
+        //     while (--n>maxl) {
+        //         R = one/(R + (2*n+one)*rr); // Compute R_n from R_{n+1}
+        //     }
+        // }
+
+        {
+            // Since the reciprocal is slow, rewrite again in terms of function values instead of ratios, but now
+            // we have to manually handle the possibility of overflow.  Tile the loop so that we don't need an
+            // if test in the inner loop ... this will be important in the vectorized version of this code.
+
+            //T s = (2*n+1)*rr;
+            //T RR = (std::sqrt(s*s + T(4)) - s + eps)*half; // note the eps to avoid RR=0 if s*s > 4/eps but this is still wrong
+            T RR = (r*r + (n + 1)*r)/(r*r + (2*n + 1)*r + (2*n + 1)*(n + 1)); // initial guess that is slightly less accurate but no cancellation
+
+            //T RR = (2*r*r*r + (3*n +3)*r*r + (2*n + 3)*(n + 1)*r)/(2*r*r*r + (5*n + 3)*r*r + (6*n + 3)*(n + 1)*r + (2*n + 3)*(2*n + 1)*(n + 1)); // no better
+
+            T j1 = min;
+            T j0 = min/RR;
+
+            int step = 30;
+            if constexpr (std::is_same<T,float>::value) step = 3;
+            for (int i=n-1; i>int(maxl); i-=step) { // use int to avoid size_t wraparound
+                size_t topl = std::max(i-step, int(maxl));
+                for (size_t m=i; m>topl; m--) {
+                    T j = j1 + ((2*m+1)*rr)*j0;
+                    j1 = j0;
+                    j0 = j;
+                }
+                if (j1 > one) {
+                    j0 *= min;
+                    j1 *= min;
+                }
+            }
+            RR = j1 / j0;
+            //std::cout << "TTT " << maxl << " " << r << " " << R << " " << RR << " " << (R-RR)/eps << std::endl;
+            R = RR;
+        }
+
+        // This helps avoid underflow but loses precision when computing j1/R
+        //T j1 = min * std::max(one, R*r/T(2*maxl+one)); //was R ... now trying to avoid gradual underflow in s below
+        //T j0 = j1/R; // was 1
+
+        T j1 = R;
+        T j0 = one;
+        if constexpr (std::is_same<T,dd_real>::value || std::is_same<T,qd_real>::value) {
+            j1 = std::ldexp(R,-128);
+            j0 = std::exp2(-128);
+        }
+
+        j[maxl] = j0;
+
+        for (int l=maxl; l>0; l--) {
+            j[l-1] = j1 + ((2*l+1)*rr)*j0;
+            j1 = j0;
+            j0 = j[l-1];
+        }
+
+        j0 = -std::expm1(-two*r)*half*rr; // for small r use expm1 to avoid cancellation
+
+        T s = j0/j[0];
+
+        if (s < min) {
+            std::cout << "warning: s has underflowed " << s << " " << min << std::endl;
+        }
+        j[0] = j0;
+
+        // Newton correction seems to help a lot for double, a little for float, but not at all for dd_real.
+        T reps = 0;
+        if constexpr (std::is_same<T,float>::value) reps = float(double(r) - double(1.0)/double(rr));
+        else if constexpr (std::is_same<T,double>::value) reps = to_double(dd_real(r) - dd_real(1.0)/dd_real(rr));
+        else if constexpr (std::is_same<T,dd_real>::value) reps = to_dd_real(qd_real(r) - qd_real(1.0)/qd_real(rr));
+
+        for (size_t l=1; l<=maxl; l++) {
+            j[l] *= s;
+            if constexpr (!std::is_same<T,qd_real>::value) {
+                j[l] += (j[l-1]-(one+(l+one)*rr)*j[l])*reps; // Newton correction due to inexact computing of 1/r
+            }
+
+            // auto RELERR = [&](const T& a, const T& b) {return (std::abs(a-b)/std::abs(b))/eps;};
+
+            // if (doprint) {
+            //     T R = j[l]/j[l-1];
+            //     T Rtest = ggj(l,ggi)/ggj(l-1,ggi);
+            //     T Rrelerr = RELERR(R,Rtest);
+            //     T jrelerr = RELERR(j[l],ggj(l,ggi));
+            //     if (Rrelerr > 0) {
+            //         std::cout << l << " RtestZ " << Rtest << " " << R << " " << Rrelerr << " " << j[l] << " " << ggj(l,ggi) << " " << jrelerr << std::endl;
+            //     }
+            // }
+        }
+    }
+    else {
+        j[0] = half*rr;
+        j[1] = (half - j[0])*rr;
+        for (size_t l=1; l<maxl; l++) j[l+1] = j[l-1] - ((2*l+1)*rr)*j[l];
+    }
+    return j;
+}
+
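+// Of the variants here, JsphericalscaledVecDumb (tiled downward recursion on the
+// function values, with no per-step reciprocal) is the one selected by
+// JsphericalscaledY below; the BAD and GoodReciprocal versions are retained for
+// comparison.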
+template <typename T>
+std::vector<T> JsphericalscaledVecDumbGoodReciprocal(size_t maxl, T r) {
+    const T zero = 0;
+    std::vector<T> j(maxl+1,zero);
+    if (r == zero) return j;
+
+    const T eps = std::numeric_limits<T>::epsilon();
+    const T min = std::numeric_limits<T>::min();
+    const T one = 1;
+    const T two = 2;
+    const T half = T(0.5);
+    const T rr = one/r;
+    T R;
+
+    bool doprint = false; //std::abs(r-T(9.536743e-07)) < 1e-11;
+    if (doprint) std::cout << "r " << r << " eps " << eps << std::endl;
+
+    T rcut = 20.0; // So that exp(-2*rcut) is much less than epsilon including gradual underflow
+    if constexpr (std::is_same<T,float>::value) rcut = 10.0;
+    else if constexpr (std::is_same<T,double>::value) rcut = 20.0;
+    else if constexpr (std::is_same<T,dd_real>::value) rcut = 40.0;
+    else if constexpr (std::is_same<T,qd_real>::value) rcut = 80.0;
+
+    if (r < std::max(rcut,T(12.5*maxl))) {
+        size_t n;
+        if constexpr (std::is_same<T,float>::value) n = maxl + 20;
+        else if constexpr (std::is_same<T,double>::value) n = maxl + 80;
+        else if constexpr (std::is_same<T,dd_real>::value) n = maxl + 130;
+        else if constexpr (std::is_same<T,qd_real>::value) n = maxl + 200;
+
+        {
+            T s = (2*n+one)/r;
+            R = (std::sqrt(s*s + T(4)) - s)*half;
+        }
+
+        while (--n>maxl) {
+            R = one/(R + (2*n+one)*rr); // Compute R_n from R_{n+1}
+        }
+
+        // This helps avoid underflow but loses precision when computing j1/R
+        //T j1 = min * std::max(one, R*r/T(2*maxl+one)); //was R ... now trying to avoid gradual underflow in s below
+        //T j0 = j1/R; // was 1
+
+        T j1 = R;
+        T j0 = one;
+        if constexpr (std::is_same<T,dd_real>::value || std::is_same<T,qd_real>::value) {
+            double small = std::exp2(-128.0);
+            j1 = mul_pwr2(R,small); // exact power of 2 so easy and exact to compute and apply
+            j0 = small;
+        }
+
+        j[maxl] = j0;
+
+        for (int l=maxl; l>0; l--) {
+            j[l-1] = j1 + ((2*l+one)*rr)*j0;
+            j1 = j0;
+            j0 = j[l-1];
+        }
+
+        j0 = -std::expm1(-two*r)*half*rr; // for small r use expm1 to avoid cancellation
+
+        T s = j0/j[0];
+
+        if (s < min) {
+            std::cout << "warning: s has underflowed " << s << " " << min << std::endl;
+        }
+        j[0] = j0;
+
+        // Newton correction seems to help a lot for double, a little for float, but not at all for dd_real.
+        T reps = 0;
+        if constexpr (std::is_same<T,float>::value) reps = float(double(r) - double(1.0)/double(rr));
+        else if constexpr (std::is_same<T,double>::value) reps = to_double(dd_real(r) - dd_real(1.0)/dd_real(rr));
+        else if constexpr (std::is_same<T,dd_real>::value) reps = to_dd_real(qd_real(r) - qd_real(1.0)/qd_real(rr));
+
+        for (size_t l=1; l<=maxl; l++) {
+            j[l] *= s;
+            if constexpr (!std::is_same<T,qd_real>::value) {
+                j[l] += (j[l-1]-(one+(l+one)*rr)*j[l])*reps; // Newton correction due to inexact computing of 1/r
+            }
+
+            // auto RELERR = [&](const T& a, const T& b) {return (std::abs(a-b)/std::abs(b))/eps;};
+
+            // if (doprint) {
+            //     T R = j[l]/j[l-1];
+            //     T Rtest = ggj(l,ggi)/ggj(l-1,ggi);
+            //     T Rrelerr = RELERR(R,Rtest);
+            //     T jrelerr = RELERR(j[l],ggj(l,ggi));
+            //     if (Rrelerr > 0) {
+            //         std::cout << l << " RtestZ " << Rtest << " " << R << " " << Rrelerr << " " << j[l] << " " << ggj(l,ggi) << " " << jrelerr << std::endl;
+            //     }
+            // }
+        }
+    }
+    else {
+        j[0] = half*rr;
+        j[1] = (half - j[0])*rr;
+        for (size_t l=1; l<maxl; l++) j[l+1] = j[l-1] - ((2*l+one)*rr)*j[l];
+    }
+    return j;
+}
+
 template <typename T>
 T JsphericalscaledY(size_t l, T r) {
-    return JsphericalscaledVec(32, r)[l];
+    return JsphericalscaledVecDumb(32, r)[l];
 }
 
@@ -876,7 +1410,25 @@ load_bessel_test_data() {
             if (l != ll) throw std::runtime_error("l mismatch");
         }
     }
-    return std::make_tuple(r, j, h);
+
+    // Perform index sort on r
+    std::vector<size_t> idx(nR);
+    std::iota(idx.begin(), idx.end(), 0);
+    std::sort(idx.begin(), idx.end(), [&](size_t i, size_t j) {return r[i] < r[j];});
+
+    // Reorder r, j, h
+    std::vector<qd_real> r1(nR);
+    Matrix<qd_real> j1(maxL+2, nR), h1(maxL+2, nR);
+    for (size_t i=0; i<nR; i++) {
+        r1[i] = r[idx[i]];
+        for (size_t l=0; l<maxL+2; l++) {
+            j1(l,i) = j(l,idx[i]);
+            h1(l,i) = h(l,idx[i]);
+        }
+    }
+    return std::make_tuple(r1, j1, h1);
 }
 
 template <typename T>
@@ -929,49 +1481,53 @@ void test_bessel2() {
     //std::cout << "r " << r;
     for (size_t i=0; i<nR; i++) {
-        std::vector<T> js = JsphericalscaledVec(maxL, r[i]);
+        //std::cout << "r " << i << " " << r[i] << std::endl;
+        std::vector<T> js = JsphericalscaledVecDumb(maxL, r[i]);
         for (size_t l=0; l<=maxL; l+=1) {
             //for (size_t l=40; l<=40; l+=5) {
                 //for (size_t i=300; i<=300; i+=20) {
-            double reps = to_double(dd_real(r[i]) - dd_real(1.0)/dd_real(1.0/r[i]));
+            //double reps = to_double(dd_real(r[i]) - dd_real(1.0)/dd_real(1.0/r[i]));
             T j0 = js[l];
             T err = (j0-j(l,i));
             T relerr = (err/j(l,i))/eps;
-            T estrelerr = ((l/r[i])*reps)/eps;
-            T estrelerr2 = 0.0;
-            if (l > 0) {
-                estrelerr2 = (j(l-1,i)-(1+(l+1)/r[i])*j(l,i))*reps/(j(l,i)*eps);
-            }
-            avgsgnerr += relerr;
-            err = abs(err);
-            relerr = relerr;
-            avgabserr += std::abs(relerr);
-            if (std::abs(relerr) > maxabserr) {
-                maxabserr = std::abs(relerr);
-                worstr = r[i];
-            }
-            maxabserr = std::max(maxabserr,std::abs(relerr));
-            count += 1;
-            // double gives 26eps with worst errors at short and intermediate distances ... this was due to values of r not being representable exactly in doubles
-            // qd gives 10eps
-            // dd gives 50eps nearly everywhere
-            T fudge = 10;
-            if constexpr (std::is_same_v<T,float>) fudge = 1;
-            else if constexpr (std::is_same_v<T,double>) fudge = 10;
-            else if constexpr (std::is_same_v<T,dd_real>) fudge = 10;
-
-            if (j(l,i)>std::numeric_limits<T>::min()) { // really tiny values will be denormed
-                if (relerr > fudge) {
-                    std::cout << "l=" << l << " r=" << r[i] << " j=" << to_str(j(l,i)) << " j0=" << to_str(j0) << " err=" << err << " relerr=" << relerr << " " << estrelerr << " " << estrelerr2 << std::endl;
+            //T estrelerr = ((l/r[i])*reps)/eps;
+            //T estrelerr2 = 0.0;
+            //if (l > 0) {
+            //    estrelerr2 = (j(l-1,i)-(1+(l+1)/r[i])*j(l,i))*reps/(j(l,i)*eps);
+            //}
+
+            if (j(l,i)>std::numeric_limits<T>::min()) { // really tiny values will be denormed so ignore them
+                avgsgnerr += relerr;
+                err = abs(err);
+                avgabserr += std::abs(relerr);
+                if (std::abs(relerr) > maxabserr) {
+                    maxabserr = std::abs(relerr);
+                    worstr = r[i];
+                }
+                maxabserr = std::max(maxabserr,std::abs(relerr));
+                count += 1;
+                // double gave 26eps with worst errors at short and intermediate distances ... this was due to test values of r not being representable exactly in doubles
+                // 26 goes to 14 from using representable r, then 14 to 6 from gradient correction for err in computing 1/r, then to 4 from not doing 1/R before starting downward recursion
+
+                // qd gives 2eps
+                // dd gives better than 20eps nearly everywhere with worst about 46
+                T fudge = 1;
+                if constexpr (std::is_same_v<T,float>) fudge = 2;
+                if constexpr (std::is_same_v<T,double>) fudge = 5;
+                else if constexpr (std::is_same_v<T,dd_real>) fudge = 47;
+                else if constexpr (std::is_same_v<T,qd_real>) fudge = 2;
+
+                if (doprint || relerr > fudge) {
+                    std::cout << "l=" << l << " r=" << r[i] << " j=" << to_str(j(l,i)) << " j0=" << to_str(j0) << " err=" << err << " relerr=" << relerr << std::endl;
                 }
             }
         }
     }
-    std::cout << "test bessel2: " << avgsgnerr/count << " " << avgabserr/count << " " << maxabserr << " eps" << " " << worstr << std::endl;
+    std::cout << "test bessel2: " << typeid(T).name() << ": " << avgsgnerr/count << " " << avgabserr/count << " " << maxabserr << " eps" << " " << worstr << std::endl;
 }
 
 template <typename T>
@@ -1043,7 +1599,6 @@ int main() {
     //BarycentricX<float> b(2000, 6, L, 12.5*L, f33);
     Barycentric<double> b(1250, 12, 0.5, 12.5*L, f33);
     b33 = b;
-    std::cout << (Jsphericalscaled(32, 0.7*L) - JsphericalscaledY(32, 0.7*L))/Jsphericalscaled(32, 0.7*L) << std::endl;
 
     // std::cout << b(20.018) << std::endl;
     // std::cout << f(20.018) << std::endl;
@@ -1054,9 +1609,12 @@ int main() {
     std::cout << b.maxerr(f33) << std::endl;
 
-    std::tie(ggr, ggj, ggh) = load_bessel_test_data();
+    std::tie(ggr, ggj, ggh) = load_bessel_test_data();
+    test_bessel2<float>();
     test_bessel2<double>();
+    test_bessel2<dd_real>();
+    test_bessel2<qd_real>();
 
     return 0;
diff --git a/src/madness/bspline/periodic.tex b/src/madness/bspline/periodic.tex
new file mode 100644
index 00000000000..2811bfeae03
--- /dev/null
+++ b/src/madness/bspline/periodic.tex
@@ -0,0 +1,232 @@
+\documentclass[12pt]{article}
+\usepackage{cite}
+\usepackage{mathtools} %% includes amsmath
+\usepackage{amssymb,amsfonts}
+\usepackage{algorithmic}
+\usepackage{textcomp}
+\usepackage{xcolor}
+\usepackage{physics}
+\usepackage{graphicx,graphics}
+\setlength{\oddsidemargin}{0.25in}
+\setlength{\evensidemargin}{-0.25in}
+\setlength{\topmargin}{-.5in}
+\setlength{\textheight}{9in}
+\setlength{\parskip}{.1in}
+\setlength{\parindent}{2em}
+\setlength{\textwidth}{6.25in}
+
+\DeclareMathOperator{\erfc}{erfc}
+
+\title{Notes on MADNESS periodic Coulomb}
+\date{\today}
+\author{Robert J. Harrison}
+
+\begin{document}
+
+\maketitle
+
+\section{Gaussian representation of Coulomb kernel}
+
+We start from the identity
+\begin{eqnarray}
+  \frac{1}{r} & = & \frac{2}{\sqrt{\pi}} \int_0^\infty \! dt \, \exp(-t^2 r^2)
+\end{eqnarray}
+change variables using $t=\exp(s)$ to obtain
+\begin{eqnarray}
+  \frac{1}{r} & = & \frac{2}{\sqrt{\pi}} \int_{-\infty}^\infty \! ds \,\exp(-r^2 \exp(2s) + s)
+\end{eqnarray}
+whose integrand decays super-exponentially on the right and exponentially on the left, with rates that are only weakly $r$ dependent. Thus, a trapezoidal quadrature can easily give arbitrary precision over an arbitrary range (e.g., size of universe!) with a number of points that depends only logarithmically on both attributes:
+\begin{eqnarray}
+  \frac{1}{r} & \approx & \sum_{i=0}^{M-1} \omega_i \exp(-t_i r^2) \\
+  \omega_i & = & \frac{2 h}{\sqrt{\pi}} \exp(s_i) \\
+  t_i & = & \exp(2 s_i) \\
+  s_i & = & s_0 + i h ,
+\end{eqnarray}
+where $h$ (the spacing in the quadrature) and $s_0$ can be chosen either empirically or from a rigorous analysis of the error in the quadrature.
+
+Range separation in the periodic operator is performed by separately treating all exponents less than some threshold. We note that
+\begin{eqnarray}
+  \frac{\erf\left( \exp(S) r \right)}{r} & = & \frac{2}{\sqrt{\pi}} \int_{-\infty}^S \! ds \,\exp(-r^2 \exp(2s) + s) \\
+  \frac{\erfc\left( \exp(S) r \right)}{r} & = & \frac{2}{\sqrt{\pi}} \int_S^\infty \! ds \,\exp(-r^2 \exp(2s) + s)
+\end{eqnarray}
+These forms have been used in screened response calculations, and high accuracy can be maintained in the trapezoid integration by using high-order end-point corrections at the discontinuity.
+
+\section{Spheropoles}
+
+A spheropole is a charge distribution with all moments vanishing, and therefore the resulting potential vanishes exterior to the distribution. Of necessity, it is spherically symmetric. Since we can evaluate the integrals, we construct a spheropole as the difference between two normalized Gaussian distributions with different exponents:
+\begin{eqnarray}
+  \rho(r) & = & g_\alpha(r) - g_\beta(r) \\
+  g_\alpha(r) & = & \left( \frac{\alpha}{\pi} \right)^{3/2} \exp(-\alpha r^2) \\
+  1 & = & 4 \pi \int_0^\infty \! dr \, r^2 g_\alpha(r)
+\end{eqnarray}
+
+The potential due to a normalized Gaussian $g_\alpha(r)$ is
+\begin{eqnarray}
+  u_\alpha(r) & = & \frac{\erf(\sqrt{\alpha} r)}{r}
+\end{eqnarray}
+which can be verified by substitution into Poisson's equation
+\begin{eqnarray}
+  \nabla^2 u & = & -4 \pi \rho
+\end{eqnarray}
+Thus, the potential due to the spheropole is just $u_\alpha(r) - u_\beta(r)$.
+
+We can slightly painfully verify that using the representation of $1/r$ via the integral form reproduces the potential, meaning that we can indeed switch the order of the two infinite integrations (one to do the convolution of the Green function with the charge density, and one in the representation of $1/r$). This is actually already apparent from Tonelli's theorem.
+
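+For the manipulations below it is worth recording that each Gaussian in the integral representation is proportional to a normalized Gaussian, which follows directly from the definition of $g_\alpha$ above:
+\begin{eqnarray}
+  \exp(-t^2 r^2) & = & \left( \frac{\pi}{t^2} \right)^{3/2} g_{t^2}(r)
+\end{eqnarray}
+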
+With ``$*$'' denoting convolution, we first note that
+\begin{eqnarray}
+  g_\alpha(r) * g_\beta(r) & = & g_{\frac{\alpha \beta}{\alpha+\beta}}(r) .
+\end{eqnarray}
+Then, the potential due to a Gaussian becomes
+\begin{eqnarray}
+  \frac{1}{r} * g_\alpha(r) & = & \frac{2}{\sqrt{\pi}} \int_0^\infty \! dt \, \exp(-t^2 r^2) * g_\alpha(r) \\
+  & = & \frac{2}{\sqrt{\pi}} \int_0^\infty \! dt \, \left( \frac{\pi}{t^2} \right)^{3/2} g_{t^2}(r) * g_\alpha(r) \\
+  & = & \frac{2}{\sqrt{\pi}} \int_0^\infty \! dt \, \left( \frac{\pi}{t^2} \right)^{3/2} g_{\frac{t^2 \alpha}{t^2 + \alpha}}(r) \\
+  & = & \frac{2}{\sqrt{\pi}} \int_0^\infty \! dt \, \left( \frac{\alpha}{t^2 + \alpha} \right)^{3/2} \exp\left(-\frac{t^2 \alpha}{t^2 + \alpha} r^2 \right) \\
+  & = & \frac{2}{\sqrt{\pi}r} \int_0^{\sqrt{\alpha} r} \! dz \, \exp(-z^2) \\
+  & = & \frac{\erf(\sqrt{\alpha} r)}{r}
+\end{eqnarray}
+in which we made the substitution
+\begin{eqnarray}
+  z^2 & = & \frac{t^2 \alpha}{t^2 + \alpha} r^2
+\end{eqnarray}
+
+So we are happy with the free-space situation.
+
+\section{Periodic sum of a Gaussian}
+
+In 1D, we define the familiar integral
+\begin{eqnarray}
+  I & = & \frac{1}{\sqrt{\pi}} \int_{-\infty}^\infty \! dx \, \exp(-x^2) \\
+  & = & 1
+\end{eqnarray}
+If we approximate this integral using the trapezoid rule with spacing $h$
+\begin{eqnarray}
+  I_h & = & h \sum_i \frac{1}{\sqrt{\pi}} \exp(-x_i^2) \\
+  x_i & = & i h
+\end{eqnarray}
+then, for small $h$ the error behaves as (see Trefethen's SIAM Review article ``The exponentially convergent trapezoidal rule'' for more details)
+\begin{eqnarray}
+  | I_h - I | & \approx & 2 \exp\left( -\frac{\pi^2}{h^2} \right)
+\end{eqnarray}
+To get 17 digits, $h=0.5$ suffices. In turn, this means that for $\alpha < 0.25$ the lattice sum
+\begin{eqnarray}
+  s_\alpha & = & \sum_i g_\alpha(x+i)
+\end{eqnarray}
+takes on, to machine precision, the value of the corresponding integral (1 here due to the normalization of the Gaussian). I.e., the lattice sum of a diffuse Gaussian is everywhere a constant to a precision that is easily made exponentially small.
+
+\section{Periodic Coulomb potential and GF}
+
+Let $\rho(r)$ be the charge density in the unit cell ($\Omega_0$) and be zero outside. Thus,
+\begin{eqnarray}
+  \sum_R \rho(r-R) ,
+\end{eqnarray}
+where $R$ is a valid lattice translation, is the charge density periodically extended to the entire crystal. This sum is absolutely convergent.
+
+The electrostatic potential (we are interested just in the unit cell, but it too will be periodic) due to this periodically-extended density can then be computed by convolution with the usual Green function
+\begin{eqnarray}
+  u(r) & = & G(r) * \sum_R \rho(r-R) \\
+  & = & \int_{\Omega} \! d^3s \, \sum_S G(r-s) \rho(s-S) \\
+  & = & \int_{\Omega_0} \! d^3s \, \rho(s) \sum_R G(r-R-s)
+\end{eqnarray}
+in which $S$ and $R$ denote valid lattice translations, and we used $\Omega$ to denote the entire (infinite) crystal volume and $\Omega_0$ to denote the unit cell. These improper (infinite) integrals and infinite sums are fraught with multiple issues that we will try to sidestep for present purposes. First, if we restrict our attention to densities with sufficiently many vanishing low-order multipoles (including the spheropoles), we know the resulting periodic potential is well-defined, though that does not fully mean we are free from issues. E.g., the sum in the last line is nominally over the Green function and not constrained by the density.
Second, imagine we are using a screened-Coulomb operator that dies perhaps exponentially at some distance --- this ensures the infinite integral is convergent (or can even be truncated to be over a finite domain). Third, imagine we are computing in a finite volume (and ignoring surface effects) --- this eliminates both the infinite integral and sum.
+
+We return briefly to the observation that the sum in the last line is over the bare Green function and so clearly blows up. We know that we have to constrain the density so that its first few moments vanish in order for its periodic potential to converge. When we switch the order of summation and integration, this property should be carried over --- i.e., the contributions to the far-field components of $G$ in the last line should correspond only to those needed to represent the potential from the specific density. So outside of some convex region containing the unit cell, $G(r)$ should decay appropriately. Alternatively, before we take the limit of the sum going to infinity, we compute the integral with $\rho$ to achieve a similar effect. However, with conditionally convergent sums you can easily take limits and get nearly any answer you want.
+
+We are expanding $G(r) = 1/r$ in terms of Gaussians and we saw above that the periodic sum of a Gaussian is a constant if the exponent is below some value. Since the net charge is zero, we can just throw the diffuse Gaussians away. We also saw above that splitting the sum over Gaussians at some exponent ($\gamma$) corresponds to the decomposition
+\begin{eqnarray}
+  \frac{1}{r} & = & \frac{\erfc(\sqrt{\gamma} r)}{r} + \frac{\erf(\sqrt{\gamma} r)}{r}
+\end{eqnarray}
+thus, discarding the diffuse Gaussians corresponds to discarding the long-range term ($\erf$).
+
+The screened potential due to a single normalized Gaussian of exponent $\alpha$ is
+\begin{eqnarray}
+  u_{\alpha,\gamma}(r) & = & \frac{\erfc(\sqrt{\gamma} r)}{r} * g_\alpha(r) \\
+  & = & \frac{\erf(\sqrt{\alpha}r) - \erf(\sqrt{\tau_{\alpha,\gamma}}r)}{r}
+\end{eqnarray}
+with
+\begin{eqnarray}
+  \tau_{\alpha,\gamma} & = & \frac{\alpha \gamma}{\alpha + \gamma}
+\end{eqnarray}
+Noting that $\tau_{\alpha,\gamma}$ is symmetric in $\alpha$ and $\gamma$, and that in the expected application we will have $\alpha > \gamma$:
+\begin{eqnarray}
+  0 \le \tau_{\alpha,\gamma} \le \gamma & < & \alpha \\
+  \lim_{\alpha \rightarrow \infty} \tau_{\alpha,\gamma} & = & \gamma
+\end{eqnarray}
+
+Thus, the screened potential due to the spheropole composed as the difference between two Gaussians is
+\begin{eqnarray}
+  u(r) & = & u_{\alpha,\gamma}(r) - u_{\beta,\gamma}(r) \\
+  & = & \frac{\erf(\sqrt{\alpha}r)-\erf(\sqrt{\beta}r)}{r} - \frac{\erf(\sqrt{\tau_{\alpha,\gamma}}r)-\erf(\sqrt{\tau_{\beta,\gamma}}r)}{r}
+\end{eqnarray}
+We are focused on the potential within the unit cell but exterior to the spheropole. Exterior means that both $g_\alpha(r)$ and $g_\beta(r)$ are negligible, and hence the exterior potential reduces to
+\begin{eqnarray}
+  u(r) & \rightarrow & - \frac{\erf(\sqrt{\tau_{\alpha,\gamma}}r)-\erf(\sqrt{\tau_{\beta,\gamma}}r)}{r}
+\end{eqnarray}
+In the limit $\alpha \gg \gamma$ and $\beta \gg \gamma$, this approaches zero.
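+
+Before putting in numbers, a useful first-order estimate (a sketch using $\tau_{\alpha,\gamma} = \gamma/(1+\gamma/\alpha) \approx \gamma(1 - \gamma/\alpha)$ and $\frac{d}{d\tau} \erf(\sqrt{\tau} r) = \frac{r}{\sqrt{\pi \tau}} \exp(-\tau r^2)$) is
+\begin{eqnarray}
+  u(r) & \approx & \frac{\gamma^2}{\sqrt{\pi \gamma}} \left( \frac{1}{\alpha} - \frac{1}{\beta} \right) \exp(-\gamma r^2) ,
+\end{eqnarray}
+so the exterior tail decays like a diffuse Gaussian of exponent $\gamma$ and vanishes only as $1/\alpha - 1/\beta$. This is consistent with the value computed next.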
+Assuming a unit cell of width 1, with the spheropole in the center, $\alpha=100$, $\beta=10000$, and $\gamma=0.25$ we compute
+\begin{eqnarray}
+  \tau_{\alpha \gamma} & = & 0.249377 \\
+  \tau_{\beta \gamma} & = & 0.249994 \\
+  u(0.5) & = & 0.000655
+\end{eqnarray}
+Finally, we need this summed over neighboring cells out to the distance at which the screened potential is exponentially small. For $\gamma=0.25$ this is about $\pm 12$ in each dimension, but in the code we only do this in 1D once for each Gaussian. This averaging makes the potential exterior to the spheropole into a constant.
+
+Let's compute this value since we can express $\erfc$ in Gaussians and can compute their periodic sum:
+\begin{eqnarray}
+  \frac{\erfc(\sqrt{\tau}r)}{r} & = & \frac{2}{\sqrt{\pi}} \int_{\sqrt{\tau}}^\infty \! dt \,\exp(-r^2 t^2) \\
+  & = & \frac{2}{\sqrt{\pi}} \int_{\sqrt{\tau}}^\infty \! dt \, \left( \frac{\pi}{t^2} \right)^{3/2} g_{t^2}(r)
+\end{eqnarray}
+We take the periodic sum over $r+R$, and legitimately exchange the order of summation and integration. Thus, we seek
+\begin{eqnarray}
+  \sum_R \frac{\erfc(\sqrt{\tau}|r+R|)}{|r+R|} & = & \frac{2}{\sqrt{\pi}} \int_{\sqrt{\tau}}^\infty \! dt \, \left( \frac{\pi}{t^2} \right)^{3/2} \sum_R g_{t^2}(|r+R|)
+\end{eqnarray}
+Unfortunately, we only exactly know the sum for small exponents, but here the integral is over exponents larger than $\tau$ --- this was the mistake in the original set of notes.
+
+How to proceed? We know the asymptotic form of the complementary error function is
+\begin{eqnarray}
+  \erfc(x) & = & \frac{e^{-x^2}}{x \sqrt{\pi}} \left( 1 - \frac{1}{2x^2} + \frac{3}{4 x^4} + O\left(\frac{1}{x^6}\right) \right)
+\end{eqnarray}
+with the first term giving at least 1 significant digit if $x > \sqrt{5} \approx 2.2$. So with $\tau=0.25$, the asymptotic form is suitable for $r > 4.5$.
+
+\section{Next steps}
+
+So what happened here? Focusing on spheropoles:
+\begin{itemize}
+\item If we compute the sum of the potentials arising from each density we have an absolutely convergent sum, since each is zero everywhere exterior to its charge density. This is what we are seeking to reproduce.
+\item And, we want this zero of potential to be independent of the specific form, magnitude, and location within the unit cell (including being split with a neighboring cell) of the spheropole.
+\item When we exchange the sum and integration, we now have a divergent sum over the Green function that we are attempting to wrangle to a physically meaningful answer.
+\item We remove the long-range part of the potential to obtain an absolutely convergent result by noting that a diffuse Gaussian sums to a constant, which integrates against the zero net charge to give zero. However, this constant will not be finite when summed over the entire expansion of the potential.
+\item When we convolve the screened Coulomb potential with a single spheropole, the resulting potential has a tail within the unit cell, which in the periodic sum averages into an undesired non-zero constant.
+\end{itemize}
+
+Noting we can add and subtract anything we want as long as it sums to a constant in the periodic sum, we can try to undo the damage we did to the short-range potential on the scale of the unit cell. So, let's seek a modification so that the screened Green function agrees more accurately with the exact Coulomb potential on the range of the unit cell. Given that computationally we can only easily handle Gaussians, let's try adding some diffuse ones back into the mix.
+We can do this in a variety of ways, e.g.,
+\begin{itemize}
+\item find a different partitioning or change of variables in the integral representation used to form the Gaussian expansion, or
+\item numerically fit a small expansion.
+\end{itemize}
+
+Proceeding numerically, let's just add a single diffuse Gaussian with exponent $\gamma$ and adjust its coefficient so that the average error is zero over a sphere (here chosen of radius 0.5 so that it touches the faces of the unit cell, but perhaps better is to touch the corners). With $\gamma=0.25$ this yields the corrected potential
+\begin{eqnarray}
+  \frac{\erfc(0.5 r)}{r} + 0.5784458587 \exp(-0.25 r^2)
+\end{eqnarray}
+Below is a plot of the uncorrected potential and $1/r$ --- the corrected potential is within the line width of $1/r$ on the plot! This is so good that something must be going on that needs analyzing, but \ldots
+\begin{figure}[hb]
+\includegraphics[width=.9\linewidth]{screendpot.pdf}
+  \caption{Screened potential ($\erfc(0.5r)/r$) and exact potential ($1/r$).}
+\end{figure}
+
+Let's compute the correction to the (screened free-space) potential of the spheropole. Noting
+\begin{eqnarray}
+  0.5784458587 \exp(-0.25 r^2) & = & 25.76781018 \, g_{0.25}(r)
+\end{eqnarray}
+at $r=0.5$ this gives a correction (sign?)
+\begin{eqnarray}
+  25.76781018 \left( g_{\frac{0.25\alpha}{0.25+\alpha}}(0.5) - g_{\frac{0.25\beta}{0.25+\beta}}(0.5) \right) & = & -0.00192748693998839445
+\end{eqnarray}
+which substantially overshoots the correction (even if I got the sign right).
+
+What happened here? We're not evaluating the Green function --- we're convolving with it, so the value at every point contributes. We don't want to \ldots
+
+\end{document}
diff --git a/src/madness/bspline/testsum.py b/src/madness/bspline/testsum.py
new file mode 100644
index 00000000000..354228f3c9d
--- /dev/null
+++ b/src/madness/bspline/testsum.py
@@ -0,0 +1,27 @@
+from math import *
+
+tau = 0.25
+fac = pow(tau/pi,1.5)
+hi = ceil(sqrt(2.3*16/tau))
+
+x = 0.5
+y = 0.5
+z = 0.5
+
+s = 0.0 # verify norm of gaussian is computed as 1
+u = 0.0 # verify sum of screened potential
+for X in range(-hi,hi+1):
+    for Y in range(-hi,hi+1):
+        for Z in range(-hi,hi+1):
+            rR = sqrt((x+X)**2 + (y+Y)**2 + (z+Z)**2)
+            s += fac*exp(-tau*rR**2)
+            u += erfc(sqrt(tau)*rR)/rR
+
+print("tau", tau, "fac", fac, "hi", hi)
+print("norm", s, 1.0)
+print("potn", u, pi/tau)
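+
+# Sketch (illustrative parameters, not tuned): also check the trapezoid-in-s
+# representation from the notes, 1/r ~= sum_i w_i*exp(-t_i*r^2), with
+# w_i = (2*h/sqrt(pi))*exp(s_i), t_i = exp(2*s_i), s_i = s0 + i*h.
+h = 0.5
+s0 = -40.0
+M = 180
+rtest = 3.7  # arbitrary test radius
+q = 0.0
+for i in range(M):
+    si = s0 + i*h
+    q += (2.0*h/sqrt(pi))*exp(si)*exp(-exp(2.0*si)*rtest*rtest)
+print("quad", q, 1.0/rtest)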