From 41c8b5f878457dff5fbf2abfd4e69b0297c3948c Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Fri, 23 Jun 2023 09:22:01 +0100
Subject: [PATCH 1/4] Fix verification and synchronization for reduction

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 pattern/reduction.cpp | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp
index cb97367..f11ef7c 100644
--- a/pattern/reduction.cpp
+++ b/pattern/reduction.cpp
@@ -48,30 +48,43 @@ class Reduction
   void submit_ndrange(std::vector<cl::sycl::event>& events){
     this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
                         const size_t reduction_size, const size_t num_groups) {
-      events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups));
+      auto ev = this->local_reduce_ndrange(input, output, reduction_size,
+                                            num_groups);
+      // We need to wait for the event here to make sure the kernel finished
+      // execution before swapping the buffers.
+      ev.wait();
+      events.push_back(ev);
     });
   }
 
   void submit_hierarchical(std::vector<cl::sycl::event>& events){
     this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
                         const size_t reduction_size, const size_t num_groups) {
-      events.push_back(this->local_reduce_hierarchical(input, output, reduction_size,
-                                      num_groups));
+      auto ev = this->local_reduce_hierarchical(input, output, reduction_size,
+                                                  num_groups);
+      // We need to wait for the event here to make sure the kernel finished
+      // execution before swapping the buffers.
+      ev.wait();
+      events.push_back(ev);
     });
   }
 
   bool verify(VerificationSetting &ver) {
-    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
-        sycl::range<1>{0}, sycl::id<1>{1})[0];
+    T result = _final_output_buff->get_host_access()[0];
 
     // Calculate CPU result in fp64 to avoid obtaining a wrong verification result
+    std::vector<T> initial_input(_input.size());
     std::vector<double> input_fp64(_input.size());
-    for(std::size_t i = 0; i < _input.size(); ++i)
-      input_fp64[i] = static_cast<double>(_input[i]);
+    // _input_buff will be re-used as output buffer in the multi-step reduction,
+    // overriding the content of _input. Initialize again with the original
+    // input values.
+    generate_input(initial_input);
+    for(std::size_t i = 0; i < initial_input.size(); ++i)
+      input_fp64[i] = static_cast<double>(initial_input[i]);
 
     double delta =
         static_cast<double>(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{});
-    
+
     return std::abs(delta) < 1.e-5;
   }
 private:

From f2a94cffdf0d20e18654a6c02ef43bfd38f617dd Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Tue, 27 Jun 2023 13:14:29 +0100
Subject: [PATCH 2/4] Reimplement nbody verification

nbody results can get relatively large, so verification may fail for
lower precision FP types, i.e., float. Change verification so that it
uses relative error.
---
 single-kernel/nbody.cpp | 43 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 32 deletions(-)

diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp
index 956f5a1..27846d8 100644
--- a/single-kernel/nbody.cpp
+++ b/single-kernel/nbody.cpp
@@ -2,6 +2,7 @@
 
 #include <iostream>
 #include <cassert>
+#include <limits>
 
 using namespace cl;
 
@@ -105,41 +106,19 @@ class NBody
       host_resulting_velocities[i] = new_v;
     }
 
-    double deviation = std::sqrt(
-        calculateSquaredDifference(host_resulting_particles.data(), resulting_particles.get_pointer(), particles.size()) +
-        calculateSquaredDifference(host_resulting_velocities.data(), resulting_velocities.get_pointer(), particles.size()));
-
-    return deviation < 1.e-6;
+    constexpr float_type maxErr = 10.f * std::numeric_limits<float_type>::epsilon();
+    return checkResults(
+               host_resulting_particles.begin(), host_resulting_particles.end(), resulting_particles.begin(), maxErr) &&
+           checkResults(host_resulting_velocities.begin(), host_resulting_velocities.end(),
+               resulting_velocities.begin(), maxErr);
   }
 
 protected:
-
-  template<class T>
-  double calculateSquaredDifference(sycl::vec<T,3> a, sycl::vec<T,3> b) {
-    auto diff = a - b;
-    diff *= diff;
-
-    return static_cast<float_type>(diff.x()+diff.y()+diff.z());
-  }
-
-  template<class T>
-  double calculateSquaredDifference(sycl::vec<T,4> a, sycl::vec<T,4> b) {
-    auto diff = a - b;
-    diff *= diff;
-
-    return static_cast<float_type>(diff.x()+diff.y()+diff.z()+diff.w());
-  }
-
-  template<class T>
-  double calculateSquaredDifference(const T* a, const T* b, std::size_t size) {
-
-    double result = 0.0;
-
-    for(std::size_t i = 0; i < size; ++i) {
-      result += calculateSquaredDifference(a[i], b[i]);
-    }
-
-    return result;
+  template <class InputIter0, class InputIter1>
+  static bool checkResults(InputIter0 expectedBegin, InputIter0 expectedEnd, InputIter1 gotBegin, float_type maxErr) {
+    return std::equal(expectedBegin, expectedEnd, gotBegin, [=](const auto& expected, const auto& got) {
+      return sycl::distance(expected, got) / sycl::length(expected) < maxErr;
+    });
   }
 
   void submitNDRange(sycl::buffer<particle_type>& particles, sycl::buffer<vector_type>& velocities) {

From c873c9763d14ca0ec9e7ad6c2e0342fead3f578b Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 29 Jun 2023 09:09:20 +0100
Subject: [PATCH 3/4] Address PR feedback

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 pattern/reduction.cpp | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp
index f11ef7c..5a20d42 100644
--- a/pattern/reduction.cpp
+++ b/pattern/reduction.cpp
@@ -48,24 +48,15 @@ class Reduction
   void submit_ndrange(std::vector<cl::sycl::event>& events){
     this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
                         const size_t reduction_size, const size_t num_groups) {
-      auto ev = this->local_reduce_ndrange(input, output, reduction_size,
-                                            num_groups);
-      // We need to wait for the event here to make sure the kernel finished
-      // execution before swapping the buffers.
-      ev.wait();
-      events.push_back(ev);
+      events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups));
     });
   }
 
   void submit_hierarchical(std::vector<cl::sycl::event>& events){
     this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
                         const size_t reduction_size, const size_t num_groups) {
-      auto ev = this->local_reduce_hierarchical(input, output, reduction_size,
-                                                  num_groups);
-      // We need to wait for the event here to make sure the kernel finished
-      // execution before swapping the buffers.
-      ev.wait();
-      events.push_back(ev);
+      events.push_back(this->local_reduce_hierarchical(input, output, reduction_size,
+                                      num_groups));
     });
   }
 
@@ -73,14 +64,9 @@ class Reduction
     T result = _final_output_buff->get_host_access()[0];
 
     // Calculate CPU result in fp64 to avoid obtaining a wrong verification result
-    std::vector<T> initial_input(_input.size());
     std::vector<double> input_fp64(_input.size());
-    // _input_buff will be re-used as output buffer in the multi-step reduction,
-    // overriding the content of _input. Initialize again with the original
-    // input values.
-    generate_input(initial_input);
-    for(std::size_t i = 0; i < initial_input.size(); ++i)
-      input_fp64[i] = static_cast<double>(initial_input[i]);
+    for(std::size_t i = 0; i < _input.size(); ++i)
+      input_fp64[i] = static_cast<double>(_input[i]);
 
     double delta =
         static_cast<double>(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{});

From 5cfd475a3828451c3742744bb21997d9c43c0993 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Thu, 29 Jun 2023 14:37:22 +0100
Subject: [PATCH 4/4] Use accessor::get_pointer instead of accessor::begin

---
 single-kernel/nbody.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp
index 27846d8..7f9f824 100644
--- a/single-kernel/nbody.cpp
+++ b/single-kernel/nbody.cpp
@@ -107,10 +107,10 @@ class NBody
     }
 
     constexpr float_type maxErr = 10.f * std::numeric_limits<float_type>::epsilon();
-    return checkResults(
-               host_resulting_particles.begin(), host_resulting_particles.end(), resulting_particles.begin(), maxErr) &&
+    return checkResults(host_resulting_particles.begin(), host_resulting_particles.end(),
+               resulting_particles.get_pointer(), maxErr) &&
            checkResults(host_resulting_velocities.begin(), host_resulting_velocities.end(),
-               resulting_velocities.begin(), maxErr);
+               resulting_velocities.get_pointer(), maxErr);
   }
 
 protected: