diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index acf0cc687f5..76f093837de 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -9,7 +9,7 @@
 #include <hpx/config.hpp>
 #include <hpx/functional/detail/tag_fallback_invoke.hpp>
 #include <hpx/functional/invoke.hpp>
-#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
@@ -34,19 +34,29 @@ namespace hpx::parallel::detail {
         {
             hpx::parallel::detail::rfa::RFA_bins<T> bins;
             bins.initialize_bins();
-            memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
             rfa.set_max_abs_val(init);
             rfa.unsafe_add(init);
             rfa.renorm();
-            // rfa.add(first, last);
+            size_t count = 0;
+            T max_val = std::abs(*first);
             for (auto e = first; e != last; ++e)
             {
-                // printf("%f \n",*e);
-                rfa.set_max_abs_val(*e);
+                T temp_max_val = std::abs(static_cast<T>(*e));
+                if (max_val < temp_max_val)
+                {
+                    rfa.set_max_abs_val(temp_max_val);
+                    max_val = temp_max_val;
+                }
                 rfa.unsafe_add(*e);
-                rfa.renorm();
+                count++;
+                if (count == rfa.endurance())
+                {
+                    rfa.renorm();
+                    count = 0;
+                }
             }
             return rfa.conv();
         }
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 340f4cc7b2e..4a9910cb6fa 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -27,7 +27,73 @@
 #include <cstdint>
 #include <limits>
 
-namespace RFA {
+namespace hpx::parallel::detail::rfa {
+    template <typename F>
+    struct type4
+    {
+        F x;
+        F y;
+        F z;
+        F w;
+    };
+
+    template <typename F>
+    struct type2
+    {
+        F x;
+        F y;
+    };
+    using float4 = type4<float>;
+    using double4 = type4<double>;
+    using float2 = type2<float>;
+    using double2 = type2<double>;
+
+    auto abs_max(float4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<float> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<double> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(float2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<float> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<double> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+// disable zero checks
+#define DISABLE_ZERO
+
+// disable nan / infinity checks
+#define DISABLE_NANINF
+
+// jump table for indexing into data
+#define MAX_JUMP 5
+    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
+
     template <typename Real>
     inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
     {
@@ -50,21 +116,74 @@ namespace RFA {
         // return arg;
     }
 
+    template <class ftype>
+    struct RFA_bins
+    {
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+
+        ///The binned floating-point reference bins
+        std::array<ftype, MAXINDEX + MAXFOLD> bins = {};
+
+        constexpr ftype& operator[](int d)
+        {
+            return bins[d];
+        }
+
+        void initialize_bins()
+        {
+            if constexpr (std::is_same_v<ftype, float>)
+            {
+                bins[0] = std::ldexp(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = std::ldexp(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+        }
+    };
+
+    static char bin_host_buffer[sizeof(RFA_bins<double>)];
+
     ///Class to hold a reproducible summation of the numbers passed to it
     ///
-    ///@param ftype Floating-point data type; either `float` or `double`
+    ///@param ftype Floating-point data type; either `float` or `double
     ///@param FOLD  The fold; use 3 as a default unless you understand it.
-    template <class ftype, int FOLD = 3,
-        typename std::enable_if<std::is_floating_point<ftype>::value>::type* =
+    template <class ftype_, int FOLD_ = 3,
+        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
             nullptr>
-    class ReproducibleFloatingAccumulator
+    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
     {
     public:
+        using ftype = ftype_;
+        static constexpr int FOLD = FOLD_;
+
+    private:
         std::array<ftype, 2 * FOLD> data = {0};
 
         ///Floating-point precision bin width
         static constexpr auto BIN_WIDTH =
-            std::is_same<ftype, double>::value ? 40 : 13;
+            std::is_same_v<ftype, double> ? 40 : 13;
         static constexpr auto MIN_EXP =
             std::numeric_limits<ftype>::min_exponent;
         static constexpr auto MAX_EXP =
@@ -91,43 +210,11 @@ namespace RFA {
         ///The number of deposits that can be performed before a renorm is necessary.
         ///Applies also to binned complex double precision.
         static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-        // static_assert(ENDURANCE == 0);
-    public:
-        ///Generates binned floating-point reference bins
-        static constexpr std::array<ftype, MAXINDEX + MAXFOLD> initialize_bins()
-        {
-            std::array<ftype, MAXINDEX + MAXFOLD> bins{0};
 
-            if (std::is_same<ftype, float>::value)
-            {
-                bins[0] = ldexp_impl(0.75, MAX_EXP);
-            }
-            else
-            {
-                bins[0] = 2.0 * ldexp_impl(0.75, MAX_EXP - 1);
-            }
-
-            for (int index = 1; index <= MAXINDEX; index++)
-            {
-                bins[index] = ldexp_impl(0.75,
-                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
-            }
-            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
-            {
-                bins[index] = bins[index - 1];
-            }
-
-            return bins;
-        }
-
-        ///The binned floating-point reference bins
-        static std::array<ftype, MAXINDEX + MAXFOLD> bins;
-
-    private:
         ///Return a binned floating-point reference bin
-        static inline constexpr const ftype* binned_bins(const int x)
+        inline const ftype* binned_bins(const int x) const
         {
-            return &bins[x];
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
         }
 
         ///Get the bit representation of a float
@@ -151,33 +238,109 @@ namespace RFA {
             return *reinterpret_cast<const uint64_t*>(&x);
         }
 
-        ///Return a pointer to the primary vector
-        inline ftype* pvec()
+        ///Return primary vector value const ref
+        inline const ftype& primary(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[4];
+                default:
+                    return data[FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[i];
+            }
+        }
+
+        ///Return carry vector value const ref
+        inline const ftype& carry(int i) const
         {
-            return &data[0];
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[FOLD + 0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[FOLD + 1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[FOLD + 2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[FOLD + 3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[FOLD + 4];
+                default:
+                    return data[2 * FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[FOLD + i];
+            }
         }
-        ///Return a pointer to the carry vector
-        inline ftype* cvec()
+
+        ///Return primary vector value ref
+        inline ftype& primary(int i)
         {
-            return &data[FOLD];
+            const auto& c = *this;
+            return const_cast<ftype&>(c.primary(i));
         }
-        ///Return a const pointer to the primary vector
-        inline const ftype* pvec() const
+
+        ///Return carry vector value ref
+        inline ftype& carry(int i)
         {
-            return &data[0];
+            const auto& c = *this;
+            return const_cast<ftype&>(c.carry(i));
         }
-        ///Return a const pointer to the carry vector
-        inline const ftype* cvec() const
+
+#ifdef DISABLE_ZERO
+        static inline constexpr bool ISZERO(const ftype)
         {
-            return &data[FOLD];
+            return false;
         }
+#else
+        static inline constexpr bool ISZERO(const ftype x)
+        {
+            return x == 0.0;
+        }
+#endif
 
+#ifdef DISABLE_NANINF
+        static inline constexpr int ISNANINF(const ftype)
+        {
+            return false;
+        }
+#else
         static inline constexpr int ISNANINF(const ftype x)
         {
             const auto bits = get_bits(x);
             return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
                 ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
         }
+#endif
 
         static inline constexpr int EXP(const ftype x)
         {
@@ -200,8 +363,8 @@ namespace RFA {
                 }
                 else
                 {
-                    frexp(x, &exp);
-                    return std::min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                    std::frexp(x, &exp);
+                    return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
                 }
             }
             return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
@@ -213,7 +376,7 @@ namespace RFA {
         inline int binned_index() const
         {
             return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
-                       EXP(pvec()[0])) /
+                       EXP(primary(0))) /
                 BIN_WIDTH;
         }
 
@@ -221,7 +384,7 @@ namespace RFA {
         ///A quick check to determine if the index is 0
         inline bool binned_index0() const
         {
-            return EXP(pvec()[0]) == MAX_EXP + EXP_BIAS;
+            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
         }
 
         ///Update manually specified binned fp with a scalar (X -> Y)
@@ -234,43 +397,40 @@ namespace RFA {
         void binned_dmdupdate(
             const ftype max_abs_val, const int incpriY, const int inccarY)
         {
-            int i;
-            int j;
-            int X_index;
-            int shift;
-            auto* const priY = pvec();
-            auto* const carY = cvec();
-
-            if (ISNANINF(priY[0]))
-            {
+            if (ISNANINF(primary(0)))
                 return;
-            }
 
-            X_index = binned_dindex(max_abs_val);
-            if (priY[0] == 0.0)
+            int X_index = binned_dindex(max_abs_val);
+            if (ISZERO(primary(0)))
             {
                 const ftype* const bins = binned_bins(X_index);
-                for (i = 0; i < FOLD; i++)
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = bins[i];
-                    carY[i * inccarY] = 0.0;
+                    primary(i * incpriY) = bins[i];
+                    carry(i * inccarY) = 0.0;
                 }
             }
             else
             {
-                shift = binned_index() - X_index;
+                int shift = binned_index() - X_index;
                 if (shift > 0)
                 {
-                    for (i = FOLD - 1; i >= shift; i--)
+#pragma unroll
+                    for (int i = FOLD - 1; i >= 1; i--)
                     {
-                        priY[i * incpriY] = priY[(i - shift) * incpriY];
-                        carY[i * inccarY] = carY[(i - shift) * inccarY];
+                        if (i < shift)
+                            break;
+                        primary(i * incpriY) = primary((i - shift) * incpriY);
+                        carry(i * inccarY) = carry((i - shift) * inccarY);
                     }
                     const ftype* const bins = binned_bins(X_index);
-                    for (j = 0; j < i + 1; j++)
+#pragma unroll
+                    for (int j = 0; j < FOLD; j++)
                     {
-                        priY[j * incpriY] = bins[j];
-                        carY[j * inccarY] = 0.0;
+                        if (j >= shift)
+                            break;
+                        primary(j * incpriY) = bins[j];
+                        carry(j * inccarY) = 0.0;
                     }
                 }
             }
@@ -285,59 +445,59 @@ namespace RFA {
         void binned_dmddeposit(const ftype X, const int incpriY)
         {
             ftype M;
-            int i;
             ftype x = X;
-            auto* const priY = pvec();
 
-            if (ISNANINF(x) || ISNANINF(priY[0]))
+            if (ISNANINF(x) || ISNANINF(primary(0)))
             {
-                priY[0] += x;
+                primary(0) += x;
                 return;
             }
 
             if (binned_index0())
             {
-                M = priY[0];
+                M = primary(0);
                 ftype qd = x * COMPRESSION;
                 auto& ql = get_bits(qd);
                 ql |= 1;
                 qd += M;
-                priY[0] = qd;
+                primary(0) = qd;
                 M -= qd;
                 M *= EXPANSION * 0.5;
                 x += M;
                 x += M;
-                for (i = 1; i < FOLD - 1; i++)
+#pragma unroll
+                for (int i = 1; i < FOLD - 1; i++)
                 {
-                    M = priY[i * incpriY];
+                    M = primary(i * incpriY);
                     qd = x;
                     ql |= 1;
                     qd += M;
-                    priY[i * incpriY] = qd;
+                    primary(i * incpriY) = qd;
                     M -= qd;
                     x += M;
                 }
                 qd = x;
                 ql |= 1;
-                priY[i * incpriY] += qd;
+                primary((FOLD - 1) * incpriY) += qd;
             }
             else
             {
                 ftype qd = x;
                 auto& ql = get_bits(qd);
-                for (i = 0; i < FOLD - 1; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD - 1; i++)
                 {
-                    M = priY[i * incpriY];
+                    M = primary(i * incpriY);
                     qd = x;
                     ql |= 1;
                     qd += M;
-                    priY[i * incpriY] = qd;
+                    primary(i * incpriY) = qd;
                     M -= qd;
                     x += M;
                 }
                 qd = x;
                 ql |= 1;
-                priY[i * incpriY] += qd;
+                primary((FOLD - 1) * incpriY) += qd;
             }
         }
 
@@ -348,26 +508,22 @@ namespace RFA {
         ///
         ///@param incpriX stride within X's primary vector (use every incpriX'th element)
         ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        void binned_dmrenorm(const int incpriX, const int inccarX)
+        inline void binned_dmrenorm(const int incpriX, const int inccarX)
         {
-            auto* priX = pvec();
-            auto* carX = cvec();
-
-            if (priX[0] == 0.0 || ISNANINF(priX[0]))
-            {
+            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
                 return;
-            }
 
-            for (int i = 0; i < FOLD; i++, priX += incpriX, carX += inccarX)
+            for (int i = 0; i < FOLD; i++)
             {
-                auto tmp_renormd = priX[0];
+                auto tmp_renormd = primary(i * incpriX);
                 auto& tmp_renorml = get_bits(tmp_renormd);
 
-                carX[0] += (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+                carry(i * inccarX) +=
+                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
 
                 tmp_renorml &= ~(1ull << (MANT_DIG - 3));
                 tmp_renorml |= 1ull << (MANT_DIG - 2);
-                priX[0] = tmp_renormd;
+                primary(i * incpriX) = tmp_renormd;
             }
         }
 
@@ -392,18 +548,10 @@ namespace RFA {
         {
             int i = 0;
 
-            const auto* const priX = pvec();
-            const auto* const carX = cvec();
-
-            if (ISNANINF(priX[0]))
-            {
-                return priX[0];
-            }
-
-            if (priX[0] == 0.0)
-            {
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
                 return 0.0;
-            }
 
             double Y = 0.0;
             double scale_down;
@@ -413,30 +561,31 @@ namespace RFA {
             const auto* const bins = binned_bins(X_index);
             if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
             {
-                scale_down = ldexp_impl(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
-                scale_up = ldexp_impl(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
                 scaled = std::max(
                     std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
                 if (X_index == 0)
                 {
-                    Y += carX[0] * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carX[inccarX] * ((bins[1] / 6.0) * scale_down);
-                    Y += (priX[0] - bins[0]) * scale_down * EXPANSION;
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
+                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
                     i = 2;
                 }
                 else
                 {
-                    Y += carX[0] * ((bins[0] / 6.0) * scale_down);
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
                     i = 1;
                 }
                 for (; i < scaled; i++)
                 {
-                    Y += carX[i * inccarX] * ((bins[i] / 6.0) * scale_down);
-                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]) * scale_down;
+                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
+                    Y +=
+                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
                 }
                 if (i == FOLD)
                 {
-                    Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]) *
+                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
                         scale_down;
                     return Y * scale_up;
                 }
@@ -447,20 +596,20 @@ namespace RFA {
                 Y *= scale_up;
                 for (; i < FOLD; i++)
                 {
-                    Y += carX[i * inccarX] * (bins[i] / 6.0);
-                    Y += priX[(i - 1) * incpriX] - bins[i - 1];
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += primary((i - 1) * incpriX) - bins[i - 1];
                 }
-                Y += priX[(FOLD - 1) * incpriX] - bins[FOLD - 1];
+                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
             }
             else
             {
-                Y += carX[0] * (bins[0] / 6.0);
+                Y += carry(0) * (bins[0] / 6.0);
                 for (i = 1; i < FOLD; i++)
                 {
-                    Y += carX[i * inccarX] * (bins[i] / 6.0);
-                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]);
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
             }
             return Y;
         }
@@ -473,18 +622,11 @@ namespace RFA {
         {
             int i = 0;
             double Y = 0.0;
-            const auto* const priX = pvec();
-            const auto* const carX = cvec();
-
-            if (ISNANINF(priX[0]))
-            {
-                return priX[0];
-            }
 
-            if (priX[0] == 0.0)
-            {
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
                 return 0.0;
-            }
 
             //Note that the following order of summation is in order of decreasing
             //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
@@ -493,23 +635,23 @@ namespace RFA {
             const auto* const bins = binned_bins(X_index);
             if (X_index == 0)
             {
-                Y += (double) carX[0] * (double) (bins[0] / 6.0) *
+                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
                     (double) EXPANSION;
-                Y += (double) carX[inccarX] * (double) (bins[1] / 6.0);
-                Y += (double) (priX[0] - bins[0]) * (double) EXPANSION;
+                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
                 i = 2;
             }
             else
             {
-                Y += (double) carX[0] * (double) (bins[0] / 6.0);
+                Y += (double) carry(0) * (double) (bins[0] / 6.0);
                 i = 1;
             }
             for (; i < FOLD; i++)
             {
-                Y += (double) carX[i * inccarX] * (double) (bins[i] / 6.0);
-                Y += (double) (priX[(i - 1) * incpriX] - bins[i - 1]);
+                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
             }
-            Y += (double) (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
 
             return (float) Y;
         }
@@ -522,64 +664,78 @@ namespace RFA {
         ///@param inccarX stride within X's carry vector (use every inccarX'th element)
         ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
         ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdmadd(const ReproducibleFloatingAccumulator& other,
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
             const int incpriX, const int inccarX, const int incpriY,
             const int inccarY)
         {
-            auto* const priX = other.pvec();
-            auto* const carX = other.cvec();
-            auto* const priY = pvec();
-            auto* const carY = cvec();
-
-            if (priX[0] == 0.0)
+            if (ISZERO(x.primary(0)))
                 return;
 
-            if (priY[0] == 0.0)
+            if (ISZERO(primary(0)))
             {
                 for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = priX[i * incpriX];
-                    carY[i * inccarY] = carX[i * inccarX];
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
                 }
                 return;
             }
 
-            if (ISNANINF(priX[0]) || ISNANINF(priY[0]))
+            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
             {
-                priY[0] += priX[0];
+                primary(0) += x.primary(0);
                 return;
             }
 
-            // const auto X_index = binned_index(priX);
-            const auto X_index = other.binned_index();
-            const auto Y_index = binned_index();
+            const auto X_index = x.binned_index();
+            const auto Y_index = this->binned_index();
             const auto shift = Y_index - X_index;
             if (shift > 0)
             {
                 const auto* const bins = binned_bins(Y_index);
                 //shift Y upwards and add X to Y
-                for (int i = FOLD - 1; i >= shift; i--)
+#pragma unroll
+                for (int i = FOLD - 1; i >= 1; i--)
                 {
-                    priY[i * incpriY] = priX[i * incpriX] +
-                        (priY[(i - shift) * incpriY] - bins[i - shift]);
-                    carY[i * inccarY] =
-                        carX[i * inccarX] + carY[(i - shift) * inccarY];
+                    if (i < shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX) +
+                        (primary((i - shift) * incpriY) - bins[i - shift]);
+                    carry(i * inccarY) =
+                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
                 }
-                for (int i = 0; i < shift && i < FOLD; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = priX[i * incpriX];
-                    carY[i * inccarY] = carX[i * inccarX];
+                    if (i == shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
                 }
             }
-            else
+            else if (shift < 0)
             {
                 const auto* const bins = binned_bins(X_index);
                 //shift X upwards and add X to Y
-                for (int i = 0 - shift; i < FOLD; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] +=
-                        priX[(i + shift) * incpriX] - bins[i + shift];
-                    carY[i * inccarY] += carX[(i + shift) * inccarX];
+                    if (i < -shift)
+                        continue;
+                    primary(i * incpriY) +=
+                        x.primary((i + shift) * incpriX) - bins[i + shift];
+                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
+                }
+            }
+            else if (shift == 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                // add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
+                    carry(i * inccarY) += x.carry(i * inccarX);
                 }
             }
 
@@ -594,6 +750,13 @@ namespace RFA {
         }
 
     public:
+        ReproducibleFloatingAccumulator() = default;
+        ReproducibleFloatingAccumulator(
+            const ReproducibleFloatingAccumulator&) = default;
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator&) = default;
+
         ///Set the binned fp to zero
         void zero()
         {
@@ -601,22 +764,27 @@ namespace RFA {
         }
 
         ///Return the fold of the binned fp
-        int fold() const
+        constexpr int fold() const
         {
             return FOLD;
         }
 
+        ///Return the endurance of the binned fp
+        constexpr int endurance() const
+        {
+            return ENDURANCE;
+        }
+
         ///Returns the number of reference bins. Used for judging memory usage.
-        static constexpr size_t number_of_reference_bins()
+        constexpr size_t number_of_reference_bins()
         {
-            return bins.size();
+            return std::array<ftype, MAXINDEX + MAXFOLD>::size();
         }
 
         ///Accumulate an arithmetic @p x into the binned fp.
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator+=(const U x)
         {
             binned_dmdadd(static_cast<ftype>(x), 1, 1);
@@ -626,8 +794,7 @@ namespace RFA {
         ///Accumulate-subtract an arithmetic @p x into the binned fp.
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator-=(const U x)
         {
             binned_dmdadd(-static_cast<ftype>(x), 1, 1);
@@ -666,8 +833,7 @@ namespace RFA {
         ///Sets this binned fp equal to the arithmetic value @p x
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator=(const U x)
         {
             zero();
@@ -675,14 +841,6 @@ namespace RFA {
             return *this;
         }
 
-        ///Sets this binned fp equal to another binned fp
-        ReproducibleFloatingAccumulator& operator=(
-            const ReproducibleFloatingAccumulator<ftype, FOLD>& o)
-        {
-            data = o.data;
-            return *this;
-        }
-
         ///Returns the negative of this binned fp
         ///NOTE: Makes a copy and performs arithmetic; slow.
         ReproducibleFloatingAccumulator operator-()
@@ -690,14 +848,14 @@ namespace RFA {
             constexpr int incpriX = 1;
             constexpr int inccarX = 1;
             ReproducibleFloatingAccumulator temp = *this;
-            if (pvec()[0] != 0.0)
+            if (primary(0) != 0.0)
             {
                 const auto* const bins = binned_bins(binned_index());
                 for (int i = 0; i < FOLD; i++)
                 {
-                    temp.pvec()[i * incpriX] =
-                        bins[i] - (pvec()[i * incpriX] - bins[i]);
-                    temp.cvec()[i * inccarX] = -cvec()[i * inccarX];
+                    temp.primary(i * incpriX) =
+                        bins[i] - (primary(i * incpriX) - bins[i]);
+                    temp.carry(i * inccarX) = -carry(i * inccarX);
                 }
             }
             return temp;
@@ -706,7 +864,7 @@ namespace RFA {
         ///Convert this binned fp into its native floating-point representation
         ftype conv() const
         {
-            if (std::is_same<ftype, float>::value)
+            if (std::is_same_v<ftype, float>)
             {
                 return binned_conv_single(1, 1);
             }
@@ -729,9 +887,8 @@ namespace RFA {
         {
             const double X = std::abs(max_abs_val);
             const double S = std::abs(binned_sum);
-            return static_cast<ftype>(
-                std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) *
-                    ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+            return static_cast<ftype>(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+                    std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
                 ((7.0 * EPSILON) /
                     (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
                         7.0 * EPSILON)) *
@@ -754,10 +911,12 @@ namespace RFA {
         {
             binned_dmdupdate(std::abs(max_abs_val), 1, 1);
             size_t count = 0;
+            size_t N = last - first;
             for (; first != last; first++, count++)
             {
                 binned_dmddeposit(static_cast<ftype>(*first), 1);
-                if (count == ENDURANCE)
+                // first conditional allows compiler to remove the call here when possible
+                if (N > ENDURANCE && count == ENDURANCE)
                 {
                     binned_dmrenorm(1, 1);
                     count = 0;
@@ -788,14 +947,11 @@ namespace RFA {
         ///@param N           Number of elements to add
         ///@param max_abs_val Maximum absolute value of any member of the range
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void add(const T* input, const size_t N, const ftype max_abs_val)
         {
             if (N == 0)
-            {
                 return;
-            }
             add(input, input + N, max_abs_val);
         }
 
@@ -807,19 +963,147 @@ namespace RFA {
         ///@param input       Start of the range
         ///@param N           Number of elements to add
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void add(const T* input, const size_t N)
         {
             if (N == 0)
+                return;
+
+            T max_abs_val = input[0];
+            for (size_t i = 0; i < N; i++)
+            {
+                max_abs_val = max(max_abs_val, std::abs(input[i]));
+            }
+            add(input, N, max_abs_val);
+        }
+
+        ///Accumulate a float4 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float4& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            binned_dmddeposit(static_cast<ftype>(x.z), 1);
+            binned_dmddeposit(static_cast<ftype>(x.w), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const double2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        void add(const float4* input, const size_t N, float max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
             {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const double2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
                 return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
             }
-            T max_abs_val = input[0];
+        }
+
+        void add(const float2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
             for (size_t i = 0; i < N; i++)
             {
-                max_abs_val = std::max(max_abs_val, std::abs(input[i]));
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
             }
+        }
+
+        void add(const float4* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const double2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const float2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
             add(input, N, max_abs_val);
         }
 
@@ -833,8 +1117,7 @@ namespace RFA {
         ///with `unsafe_add` after which `renorm()` must be called. See the source of
         ///`add()` for an example
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void set_max_abs_val(const T mav)
         {
             binned_dmdupdate(std::abs(mav), 1, 1);
@@ -858,11 +1141,4 @@ namespace RFA {
         }
     };
 
-    template <>
-    auto ReproducibleFloatingAccumulator<float>::bins =
-        ReproducibleFloatingAccumulator<float>::initialize_bins();
-    
-    template <>
-    auto ReproducibleFloatingAccumulator<double>::bins =
-        ReproducibleFloatingAccumulator<double>::initialize_bins();
-}    // namespace RFA
+}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
deleted file mode 100644
index 05f71d9ae74..00000000000
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-//Reproducible Floating Point Accumulations via Binned Floating Point
-//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
-//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
-//
-//The code accomplishes several objectives:
-//
-//1. Reproducible summation, independent of summation order, assuming only a
-//   subset of the IEEE 754 Floating Point Standard
-//
-//2. Has accuracy at least as good as conventional summation, and tunable
-//
-//3. Handles overflow, underflow, and other exceptions reproducibly.
-//
-//4. Makes only one read-only pass over the summands.
-//
-//5. Requires only one parallel reduction.
-//
-//6. Uses minimal memory (6 doubles per accumulator with fold=3).
-//
-//7. Relatively easy to use
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-
-#ifndef __CUDACC__
-#define __host__
-#define __device__
-#define __forceinline__
-#include <array>
-using std::array;
-using std::max;
-using std::min;
-#else
-#include <cuda/std/array>
-using cuda::std::array;
-using cuda::std::max;
-using cuda::std::min;
-#include "vector.hpp"
-#endif
-
-namespace hpx::parallel::detail::rfa {
-    template <typename F>
-    struct type4
-    {
-        F x;
-        F y;
-        F z;
-        F w;
-    };
-
-    template <typename F>
-    struct type2
-    {
-        F x;
-        F y;
-    };
-    using float4 = type4<float>;
-    using double4 = type4<double>;
-    using float2 = type2<float>;
-    using double2 = type2<double>;
-
-    auto abs_max(float4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<float> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<double> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(float2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<float> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<double> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-// disable zero checks
-#define DISABLE_ZERO
-
-// disable nan / infinity checks
-#define DISABLE_NANINF
-
-// jump table for indexing into data
-#define MAX_JUMP 5
-    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
-
-    template <typename Real>
-    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
-    {
-        return std::ldexp(arg, exp);
-        // while (arg == 0)
-        // {
-        //     return arg;
-        // }
-        // while (exp > 0)
-        // {
-        //     arg *= static_cast<Real>(2);
-        //     --exp;
-        // }
-        // while (exp < 0)
-        // {
-        //     arg /= static_cast<Real>(2);
-        //     ++exp;
-        // }
-
-        // return arg;
-    }
-
-    template <class ftype>
-    struct RFA_bins
-    {
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-
-        ///The binned floating-point reference bins
-        array<ftype, MAXINDEX + MAXFOLD> bins = {};
-
-        constexpr ftype& operator[](int d)
-        {
-            return bins[d];
-        }
-
-        void initialize_bins()
-        {
-            if constexpr (std::is_same_v<ftype, float>)
-            {
-                bins[0] = std::ldexp(0.75, MAX_EXP);
-            }
-            else
-            {
-                bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1);
-            }
-
-            for (int index = 1; index <= MAXINDEX; index++)
-            {
-                bins[index] = ldexp(0.75,
-                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
-            }
-            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
-            {
-                bins[index] = bins[index - 1];
-            }
-        }
-    };
-
-    static char bin_host_buffer[sizeof(RFA_bins<double>)];
-#ifdef __CUDACC__
-    __constant__ static char bin_device_buffer[sizeof(RFA_bins<double>)];
-#endif
-
-    ///Class to hold a reproducible summation of the numbers passed to it
-    ///
-    ///@param ftype Floating-point data type; either `float` or `double
-    ///@param FOLD  The fold; use 3 as a default unless you understand it.
-    template <class ftype_, int FOLD_ = 3,
-        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
-            nullptr>
-    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
-    {
-    public:
-        using ftype = ftype_;
-        static constexpr int FOLD = FOLD_;
-
-    private:
-        array<ftype, 2 * FOLD> data = {0};
-
-        ///Floating-point precision bin width
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-        ///Binned floating-point compression factor
-        ///This factor is used to scale down inputs before deposition into the bin of
-        ///highest index
-        static constexpr auto COMPRESSION =
-            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
-        ///Binned double precision expansion factor
-        ///This factor is used to scale up inputs after deposition into the bin of
-        ///highest index
-        static constexpr auto EXPANSION =
-            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
-        static constexpr auto EXP_BIAS = MAX_EXP - 2;
-        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
-        ///Binned floating-point deposit endurance
-        ///The number of deposits that can be performed before a renorm is necessary.
-        ///Applies also to binned complex double precision.
-        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-
-        ///Return a binned floating-point reference bin
-        inline const ftype* binned_bins(const int x) const
-        {
-#ifdef __CUDA_ARCH__    // must be arch not CC here
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_device_buffer)[x];
-#else
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
-#endif
-        }
-
-        ///Get the bit representation of a float
-        static inline uint32_t& get_bits(float& x)
-        {
-            return *reinterpret_cast<uint32_t*>(&x);
-        }
-        ///Get the bit representation of a double
-        static inline uint64_t& get_bits(double& x)
-        {
-            return *reinterpret_cast<uint64_t*>(&x);
-        }
-        ///Get the bit representation of a const float
-        static inline uint32_t get_bits(const float& x)
-        {
-            return *reinterpret_cast<const uint32_t*>(&x);
-        }
-        ///Get the bit representation of a const double
-        static inline uint64_t get_bits(const double& x)
-        {
-            return *reinterpret_cast<const uint64_t*>(&x);
-        }
-
-        ///Return primary vector value const ref
-        inline const ftype& primary(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[4];
-                default:
-                    return data[FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[i];
-            }
-        }
-
-        ///Return carry vector value const ref
-        inline const ftype& carry(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[FOLD + 0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[FOLD + 1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[FOLD + 2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[FOLD + 3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[FOLD + 4];
-                default:
-                    return data[2 * FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[FOLD + i];
-            }
-        }
-
-        ///Return primary vector value ref
-        inline ftype& primary(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.primary(i));
-        }
-
-        ///Return carry vector value ref
-        inline ftype& carry(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.carry(i));
-        }
-
-#ifdef DISABLE_ZERO
-        static inline constexpr bool ISZERO(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr bool ISZERO(const ftype x)
-        {
-            return x == 0.0;
-        }
-#endif
-
-#ifdef DISABLE_NANINF
-        static inline constexpr int ISNANINF(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr int ISNANINF(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
-                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
-        }
-#endif
-
-        static inline constexpr int EXP(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
-        }
-
-        ///Get index of float-point precision
-        ///The index of a non-binned type is the smallest index a binned type would
-        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
-        ///bins.
-        static inline constexpr int binned_dindex(const ftype x)
-        {
-            int exp = EXP(x);
-            if (exp == 0)
-            {
-                if (x == 0.0)
-                {
-                    return MAXINDEX;
-                }
-                else
-                {
-                    frexp(x, &exp);
-                    return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
-                }
-            }
-            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
-        }
-
-        ///Get index of manually specified binned double precision
-        ///The index of a binned type is the bin that it corresponds to. Higher
-        ///indicies correspond to smaller bins.
-        inline int binned_index() const
-        {
-            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
-                       EXP(primary(0))) /
-                BIN_WIDTH;
-        }
-
-        ///Check if index of manually specified binned floating-point is 0
-        ///A quick check to determine if the index is 0
-        inline bool binned_index0() const
-        {
-            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
-        }
-
-        ///Update manually specified binned fp with a scalar (X -> Y)
-        ///
-        ///This method updates the binned fp to an index suitable for adding numbers
-        ///with absolute value less than @p max_abs_val
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdupdate(
-            const ftype max_abs_val, const int incpriY, const int inccarY)
-        {
-            if (ISNANINF(primary(0)))
-                return;
-
-            int X_index = binned_dindex(max_abs_val);
-            if (ISZERO(primary(0)))
-            {
-                const ftype* const bins = binned_bins(X_index);
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = bins[i];
-                    carry(i * inccarY) = 0.0;
-                }
-            }
-            else
-            {
-                int shift = binned_index() - X_index;
-                if (shift > 0)
-                {
-#pragma unroll
-                    for (int i = FOLD - 1; i >= 1; i--)
-                    {
-                        if (i < shift)
-                            break;
-                        primary(i * incpriY) = primary((i - shift) * incpriY);
-                        carry(i * inccarY) = carry((i - shift) * inccarY);
-                    }
-                    const ftype* const bins = binned_bins(X_index);
-#pragma unroll
-                    for (int j = 0; j < FOLD; j++)
-                    {
-                        if (j >= shift)
-                            break;
-                        primary(j * incpriY) = bins[j];
-                        carry(j * inccarY) = 0.0;
-                    }
-                }
-            }
-        }
-
-        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y where the index of Y is
-        ///larger than the index of @p X
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        void binned_dmddeposit(const ftype X, const int incpriY)
-        {
-            ftype M;
-            ftype x = X;
-
-            if (ISNANINF(x) || ISNANINF(primary(0)))
-            {
-                primary(0) += x;
-                return;
-            }
-
-            if (binned_index0())
-            {
-                M = primary(0);
-                ftype qd = x * COMPRESSION;
-                auto& ql = get_bits(qd);
-                ql |= 1;
-                qd += M;
-                primary(0) = qd;
-                M -= qd;
-                M *= EXPANSION * 0.5;
-                x += M;
-                x += M;
-#pragma unroll
-                for (int i = 1; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-            else
-            {
-                ftype qd = x;
-                auto& ql = get_bits(qd);
-#pragma unroll
-                for (int i = 0; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-        }
-
-        ///Renormalize manually specified binned double precision
-        ///
-        ///Renormalization keeps the primary vector within the necessary bins by
-        ///shifting over to the carry vector
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        inline void binned_dmrenorm(const int incpriX, const int inccarX)
-        {
-            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
-                return;
-
-            for (int i = 0; i < FOLD; i++)
-            {
-                auto tmp_renormd = primary(i * incpriX);
-                auto& tmp_renorml = get_bits(tmp_renormd);
-
-                carry(i * inccarX) +=
-                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
-
-                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
-                tmp_renorml |= 1ull << (MANT_DIG - 2);
-                primary(i * incpriX) = tmp_renormd;
-            }
-        }
-
-        ///Add scalar to manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
-        {
-            binned_dmdupdate(X, incpriY, inccarY);
-            binned_dmddeposit(X, incpriY);
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Convert manually specified binned fp to native double-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        double binned_conv_double(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            double Y = 0.0;
-            double scale_down;
-            double scale_up;
-            int scaled;
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
-            {
-                scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
-                scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
-                scaled =
-                    max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
-                if (X_index == 0)
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
-                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
-                    i = 2;
-                }
-                else
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
-                    i = 1;
-                }
-                for (; i < scaled; i++)
-                {
-                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
-                    Y +=
-                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
-                }
-                if (i == FOLD)
-                {
-                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
-                        scale_down;
-                    return Y * scale_up;
-                }
-                if (std::isinf(Y * scale_up))
-                {
-                    return Y * scale_up;
-                }
-                Y *= scale_up;
-                for (; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += primary((i - 1) * incpriX) - bins[i - 1];
-                }
-                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
-            }
-            else
-            {
-                Y += carry(0) * (bins[0] / 6.0);
-                for (i = 1; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
-                }
-                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-            }
-            return Y;
-        }
-
-        ///Convert manually specified binned fp to native single-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        float binned_conv_single(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-            double Y = 0.0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            //Note that the following order of summation is in order of decreasing
-            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
-            //the number of carries equal to 1.
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index == 0)
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
-                    (double) EXPANSION;
-                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
-                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
-                i = 2;
-            }
-            else
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0);
-                i = 1;
-            }
-            for (; i < FOLD; i++)
-            {
-                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
-                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
-            }
-            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-
-            return (float) Y;
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        ///
-        ///@param other   Another binned fp of the same type
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
-            const int incpriX, const int inccarX, const int incpriY,
-            const int inccarY)
-        {
-            if (ISZERO(x.primary(0)))
-                return;
-
-            if (ISZERO(primary(0)))
-            {
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-                return;
-            }
-
-            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
-            {
-                primary(0) += x.primary(0);
-                return;
-            }
-
-            const auto X_index = x.binned_index();
-            const auto Y_index = this->binned_index();
-            const auto shift = Y_index - X_index;
-            if (shift > 0)
-            {
-                const auto* const bins = binned_bins(Y_index);
-                //shift Y upwards and add X to Y
-#pragma unroll
-                for (int i = FOLD - 1; i >= 1; i--)
-                {
-                    if (i < shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX) +
-                        (primary((i - shift) * incpriY) - bins[i - shift]);
-                    carry(i * inccarY) =
-                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
-                }
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i == shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-            }
-            else if (shift < 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                //shift X upwards and add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i < -shift)
-                        continue;
-                    primary(i * incpriY) +=
-                        x.primary((i + shift) * incpriX) - bins[i + shift];
-                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
-                }
-            }
-            else if (shift == 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                // add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
-                    carry(i * inccarY) += x.carry(i * inccarX);
-                }
-            }
-
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dmdmadd(other, 1, 1, 1, 1);
-        }
-
-    public:
-        ReproducibleFloatingAccumulator() = default;
-        ReproducibleFloatingAccumulator(
-            const ReproducibleFloatingAccumulator&) = default;
-        ///Sets this binned fp equal to another binned fp
-        ReproducibleFloatingAccumulator& operator=(
-            const ReproducibleFloatingAccumulator&) = default;
-
-        ///Set the binned fp to zero
-        void zero()
-        {
-            data = {0};
-        }
-
-        ///Return the fold of the binned fp
-        constexpr int fold() const
-        {
-            return FOLD;
-        }
-
-        ///Return the endurance of the binned fp
-        constexpr int endurance() const
-        {
-            return ENDURANCE;
-        }
-
-        ///Returns the number of reference bins. Used for judging memory usage.
-        constexpr size_t number_of_reference_bins()
-        {
-            return array<ftype, MAXINDEX + MAXFOLD>::size();
-        }
-
-        ///Accumulate an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator+=(const U x)
-        {
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate-subtract an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator-=(const U x)
-        {
-            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate a binned fp @p x into the binned fp.
-        ReproducibleFloatingAccumulator& operator+=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dbdbadd(other);
-            return *this;
-        }
-
-        ///Accumulate-subtract a binned fp @p x into the binned fp.
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator& operator-=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            const auto temp = -other;
-            binned_dbdbadd(temp);
-        }
-
-        ///Determines if two binned fp are equal
-        bool operator==(const ReproducibleFloatingAccumulator& other) const
-        {
-            return data == other.data;
-        }
-
-        ///Determines if two binned fp are not equal
-        bool operator!=(const ReproducibleFloatingAccumulator& other) const
-        {
-            return !operator==(other);
-        }
-
-        ///Sets this binned fp equal to the arithmetic value @p x
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator=(const U x)
-        {
-            zero();
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Returns the negative of this binned fp
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator operator-()
-        {
-            constexpr int incpriX = 1;
-            constexpr int inccarX = 1;
-            ReproducibleFloatingAccumulator temp = *this;
-            if (primary(0) != 0.0)
-            {
-                const auto* const bins = binned_bins(binned_index());
-                for (int i = 0; i < FOLD; i++)
-                {
-                    temp.primary(i * incpriX) =
-                        bins[i] - (primary(i * incpriX) - bins[i]);
-                    temp.carry(i * inccarX) = -carry(i * inccarX);
-                }
-            }
-            return temp;
-        }
-
-        ///Convert this binned fp into its native floating-point representation
-        ftype conv() const
-        {
-            if (std::is_same_v<ftype, float>)
-            {
-                return binned_conv_single(1, 1);
-            }
-            else
-            {
-                return binned_conv_double(1, 1);
-            }
-        }
-
-        ///@brief Get binned fp summation error bound
-        ///
-        ///This is a bound on the absolute error of a summation using binned types
-        ///
-        ///@param N           The number of single precision floating point summands
-        ///@param max_abs_val The summand of maximum absolute value
-        ///@param binned_sum  The value of the sum computed using binned types
-        ///@return            The absolute error bound
-        static constexpr ftype error_bound(
-            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
-        {
-            const double X = std::abs(max_abs_val);
-            const double S = std::abs(binned_sum);
-            return static_cast<ftype>(max(X, ldexp(0.5, MIN_EXP - 1)) *
-                    ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
-                ((7.0 * EPSILON) /
-                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
-                        7.0 * EPSILON)) *
-                    S);
-        }
-
-        ///Add @p x to the binned fp
-        void add(const ftype x)
-        {
-            binned_dmdadd(x, 1, 1);
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last, const ftype max_abs_val)
-        {
-            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
-            size_t count = 0;
-            size_t N = last - first;
-            for (; first != last; first++, count++)
-            {
-                binned_dmddeposit(static_cast<ftype>(*first), 1);
-                // first conditional allows compiler to remove the call here when possible
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last)
-        {
-            const auto max_abs_val = *std::max_element(
-                first, last, [](const auto& a, const auto& b) {
-                    return std::abs(a) < std::abs(b);
-                });
-            add(first, last, static_cast<ftype>(max_abs_val));
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N, const ftype max_abs_val)
-        {
-            if (N == 0)
-                return;
-            add(input, input + N, max_abs_val);
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            T max_abs_val = input[0];
-            for (size_t i = 0; i < N; i++)
-            {
-                max_abs_val = max(max_abs_val, std::abs(input[i]));
-            }
-            add(input, N, max_abs_val);
-        }
-
-        ///Accumulate a float4 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float4& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            binned_dmddeposit(static_cast<ftype>(x.z), 1);
-            binned_dmddeposit(static_cast<ftype>(x.w), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const double2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        void add(const float4* input, const size_t N, float max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const double2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float4* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const double2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const float2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        //////////////////////////////////////
-        //MANUAL OPERATIONS; USE WISELY
-        //////////////////////////////////////
-
-        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
-        ///
-        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
-        ///with `unsafe_add` after which `renorm()` must be called. See the source of
-        ///`add()` for an example
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void set_max_abs_val(const T mav)
-        {
-            binned_dmdupdate(std::abs(mav), 1, 1);
-        }
-
-        ///Add @p x to the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()`
-        void unsafe_add(const ftype x)
-        {
-            binned_dmddeposit(x, 1);
-        }
-
-        ///Renormalizes the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()` and one or
-        ///more calls to `unsafe_add()`
-        void renorm()
-        {
-            binned_dmrenorm(1, 1);
-        }
-    };
-
-    
-}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 462b1473f33..0a67dea0302 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -8,7 +8,7 @@
 
 #include <hpx/init.hpp>
 #include <hpx/modules/testing.hpp>
-#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/algorithms/reduce.hpp>
 #include <hpx/parallel/algorithms/reduce_deterministic.hpp>