From 41c8b5f878457dff5fbf2abfd4e69b0297c3948c Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 23 Jun 2023 09:22:01 +0100 Subject: [PATCH 1/4] Fix verification and synchronization for reduction Signed-off-by: Lukas Sommer --- pattern/reduction.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp index cb97367..f11ef7c 100644 --- a/pattern/reduction.cpp +++ b/pattern/reduction.cpp @@ -48,30 +48,43 @@ class Reduction void submit_ndrange(std::vector& events){ this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, const size_t reduction_size, const size_t num_groups) { - events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups)); + auto ev = this->local_reduce_ndrange(input, output, reduction_size, + num_groups); + // We need to wait for the event here to make sure the kernel finished + // execution before swapping the buffers. + ev.wait(); + events.push_back(ev); }); } void submit_hierarchical(std::vector& events){ this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, const size_t reduction_size, const size_t num_groups) { - events.push_back(this->local_reduce_hierarchical(input, output, reduction_size, - num_groups)); + auto ev = this->local_reduce_hierarchical(input, output, reduction_size, + num_groups); + // We need to wait for the event here to make sure the kernel finished + // execution before swapping the buffers. + ev.wait(); + events.push_back(ev); }); } bool verify(VerificationSetting &ver) { - T result = _final_output_buff->template get_access( - sycl::range<1>{0}, sycl::id<1>{1})[0]; + T result = _final_output_buff->get_host_access()[0]; // Calculate CPU result in fp64 to avoid obtaining a wrong verification result + std::vector initial_input(_input.size()); std::vector input_fp64(_input.size()); - for(std::size_t i = 0; i < _input.size(); ++i) - input_fp64[i] = static_cast(_input[i]); + // _input_buff will be re-used as output buffer in the multi-step reduction, + // overriding the content of _input. Initialize again with the original + // input values. + generate_input(initial_input); + for(std::size_t i = 0; i < initial_input.size(); ++i) + input_fp64[i] = static_cast(initial_input[i]); double delta = static_cast(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{}); - + return std::abs(delta) < 1.e-5; } private: From f2a94cffdf0d20e18654a6c02ef43bfd38f617dd Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 27 Jun 2023 13:14:29 +0100 Subject: [PATCH 2/4] Reimplement nbody verification nbody results can get relatively large, so verification may fail for lower precision FP types, i.e., float. Change verification so that it uses relative error. --- single-kernel/nbody.cpp | 43 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 32 deletions(-) diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp index 956f5a1..27846d8 100644 --- a/single-kernel/nbody.cpp +++ b/single-kernel/nbody.cpp @@ -2,6 +2,7 @@ #include #include +#include using namespace cl; @@ -105,41 +106,19 @@ class NBody host_resulting_velocities[i] = new_v; } - double deviation = std::sqrt( - calculateSquaredDifference(host_resulting_particles.data(), resulting_particles.get_pointer(), particles.size()) + - calculateSquaredDifference(host_resulting_velocities.data(), resulting_velocities.get_pointer(), particles.size())); - - return deviation < 1.e-6; + constexpr float_type maxErr = 10.f * std::numeric_limits::epsilon(); + return checkResults( + host_resulting_particles.begin(), host_resulting_particles.end(), resulting_particles.begin(), maxErr) && + checkResults(host_resulting_velocities.begin(), host_resulting_velocities.end(), + resulting_velocities.begin(), maxErr); } protected: - - template - double calculateSquaredDifference(sycl::vec a, sycl::vec b) { - auto diff = a - b; - diff *= diff; - - return static_cast(diff.x()+diff.y()+diff.z()); - } - - template - double calculateSquaredDifference(sycl::vec a, sycl::vec b) { - auto diff = a - b; - diff *= diff; - - return static_cast(diff.x()+diff.y()+diff.z()+diff.w()); - } - - template - double calculateSquaredDifference(const T* a, const T* b, std::size_t size) { - - double result = 0.0; - - for(std::size_t i = 0; i < size; ++i) { - result += calculateSquaredDifference(a[i], b[i]); - } - - return result; + template + static bool checkResults(InputIter0 expectedBegin, InputIter0 expectedEnd, InputIter1 gotBegin, float_type maxErr) { + return std::equal(expectedBegin, expectedEnd, gotBegin, [=](const auto& expected, const auto& got) { + return sycl::distance(expected, got) / sycl::length(expected) < maxErr; + }); } void submitNDRange(sycl::buffer& particles, sycl::buffer& velocities) { From c873c9763d14ca0ec9e7ad6c2e0342fead3f578b Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 29 Jun 2023 09:09:20 +0100 Subject: [PATCH 3/4] Address PR feedback Signed-off-by: Lukas Sommer --- pattern/reduction.cpp | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp index f11ef7c..5a20d42 100644 --- a/pattern/reduction.cpp +++ b/pattern/reduction.cpp @@ -48,24 +48,15 @@ class Reduction void submit_ndrange(std::vector& events){ this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, const size_t reduction_size, const size_t num_groups) { - auto ev = this->local_reduce_ndrange(input, output, reduction_size, - num_groups); - // We need to wait for the event here to make sure the kernel finished - // execution before swapping the buffers. - ev.wait(); - events.push_back(ev); + events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups)); }); } void submit_hierarchical(std::vector& events){ this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, const size_t reduction_size, const size_t num_groups) { - auto ev = this->local_reduce_hierarchical(input, output, reduction_size, - num_groups); - // We need to wait for the event here to make sure the kernel finished - // execution before swapping the buffers. - ev.wait(); - events.push_back(ev); + events.push_back(this->local_reduce_hierarchical(input, output, reduction_size, + num_groups)); }); } @@ -73,14 +64,9 @@ class Reduction T result = _final_output_buff->get_host_access()[0]; // Calculate CPU result in fp64 to avoid obtaining a wrong verification result - std::vector initial_input(_input.size()); std::vector input_fp64(_input.size()); - // _input_buff will be re-used as output buffer in the multi-step reduction, - // overriding the content of _input. Initialize again with the original - // input values. - generate_input(initial_input); - for(std::size_t i = 0; i < initial_input.size(); ++i) - input_fp64[i] = static_cast(initial_input[i]); + for(std::size_t i = 0; i < _input.size(); ++i) + input_fp64[i] = static_cast(_input[i]); double delta = static_cast(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{}); From 5cfd475a3828451c3742744bb21997d9c43c0993 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Thu, 29 Jun 2023 14:37:22 +0100 Subject: [PATCH 4/4] Use accessor::get_pointer instead of accessor::begin --- single-kernel/nbody.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp index 27846d8..7f9f824 100644 --- a/single-kernel/nbody.cpp +++ b/single-kernel/nbody.cpp @@ -107,10 +107,10 @@ class NBody } constexpr float_type maxErr = 10.f * std::numeric_limits::epsilon(); - return checkResults( - host_resulting_particles.begin(), host_resulting_particles.end(), resulting_particles.begin(), maxErr) && + return checkResults(host_resulting_particles.begin(), host_resulting_particles.end(), + resulting_particles.get_pointer(), maxErr) && checkResults(host_resulting_velocities.begin(), host_resulting_velocities.end(), - resulting_velocities.begin(), maxErr); + resulting_velocities.get_pointer(), maxErr); } protected: