diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 00000000..d21f4880
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,28 @@
+---
+#
+# Clang-Tidy configuration for SYCL-Bench.
+#
+# There are three usage scenarios:
+# 1. Automatic checks through an IDE (CLion, VsCode, ...)
+# 2. Running manually on select files (not recommended)
+#    `clang-tidy -p path/to/compile_commands.json file1 [file2, ...]`
+#    Note: A script for running clang-tidy on all SYCL-Bench sources is provided in `ci/run-clang-tidy.sh`
+# 3. Running on a diff (for CI)
+#    `git diff -U0 --no-color | clang-tidy-diff.py -p1 -path path/to/compile_commands.json`
+#
+InheritParentConfig: false
+# See https://clang.llvm.org/extra/clang-tidy/checks/list.html for a full list of available checks.
+Checks: -*,
+  readability-*,
+  -readability-avoid-const-params-in-decls,
+  -readability-function-cognitive-complexity,
+  -readability-identifier-length,
+  -readability-magic-numbers,
+  -readability-uppercase-literal-suffix,
+  -readability-convert-member-functions-to-static,
+  -readability-qualified-auto
+
+# Treat naming violations as errors
+WarningsAsErrors: "readability-identifier-naming"
+# Use .clang-format configuration for fixes
+FormatStyle: file
diff --git a/.gitignore b/.gitignore
index 10f3c3d8..b9b4ae53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,10 @@
 /build*
+*.csv
+img/
 
+# Clangd
+.cache/
+.clangd
+
+# Vscode
+.vscode/
\ No newline at end of file
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 00000000..eff18c29
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,21 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+conference-paper: "Proceedings of the 12th International Workshop on OpenCL and SYCL (IWOCL 24)"
+authors:
+- family-names: "Crisci"
+  given-names: "Luigi"
+- family-names: "Carpentieri"
+  given-names: "Lorenzo"
+- family-names: "Thoman"
+  given-names: "Peter"
+- family-names: "Alpay"
+  given-names: "Aksel"
+- family-names: "Heuveline"
+  given-names: "Vincent"
+- family-names: "Cosenza"
+  given-names: "Biagio"
+title: "SYCL-Bench 2020: Benchmarking SYCL 2020 on AMD, Intel, and NVIDIA GPUs"
+version: 2.0.4
+doi: 10.1145/3648115.3648120
+date-released: 2024-04-08
+url: "https://github.com/unisa-hpc/sycl-bench/"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a22b0fcf..ae0f2426 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,24 +1,31 @@
-cmake_minimum_required (VERSION 3.5)
+cmake_minimum_required(VERSION 3.12) # add_compile_definitions() below requires CMake >= 3.12
 project(sycl-bench)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake)
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake Build Type" FORCE)
+	set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake Build Type" FORCE)
 endif()
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
-# Due to CMake limitations, hipSYCL requires C++ standard to be set manually
-set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -std=c++17")
+# Default build flags
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG -fno-omit-frame-pointer" CACHE STRING "Flags used by the C++ compiler during debug builds." FORCE)
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native -ffast-math" CACHE STRING "Flags used by the C++ compiler during release builds." FORCE)
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -DNDEBUG -march=native -ffast-math -g -fno-omit-frame-pointer" CACHE STRING "Flags used by the C++ compiler during release builds with debug info." FORCE)
+
 
 if(CMAKE_GENERATOR STREQUAL "Ninja")
-  set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -fdiagnostics-color=always")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
-  set(COMPUTECPP_USER_FLAGS "${COMPUTECPP_USER_FLAGS} -fdiagnostics-color=always")
+	set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -fdiagnostics-color=always")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
+	set(COMPUTECPP_USER_FLAGS "${COMPUTECPP_USER_FLAGS} -fdiagnostics-color=always")
+endif()
+
+if(SYCL_BENCH_ENABLE_QUEUE_PROFILING)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSYCL_BENCH_ENABLE_QUEUE_PROFILING")
 endif()
 
 include(InstallRequiredSystemLibraries)
@@ -28,31 +35,55 @@ include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CMAKE_SOURCE_DIR}/polybench/common)
 
 set(supported_implementations
-  ComputeCpp
-  hipSYCL
-  LLVM
-  LLVM-CUDA
-  triSYCL
+	AdaptiveCpp
+	dpcpp
+	triSYCL
 )
 
 list(FIND supported_implementations ${SYCL_IMPL} impl_idx)
+
 if(NOT SYCL_IMPL OR impl_idx EQUAL -1)
-  message(FATAL_ERROR "Please specify SYCL_IMPL (one of: ${supported_implementations})")
+	message(FATAL_ERROR "Please specify SYCL_IMPL (one of: ${supported_implementations})")
 endif()
 
-if(SYCL_IMPL STREQUAL "ComputeCpp")
-  find_package(ComputeCpp MODULE REQUIRED)
-elseif(SYCL_IMPL STREQUAL "hipSYCL")
-  find_package(hipSYCL CONFIG REQUIRED)
-elseif(SYCL_IMPL STREQUAL "LLVM")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")
-elseif(SYCL_IMPL STREQUAL "LLVM-CUDA")
-  set(CMAKE_CXX_FLAGS
-    "${CMAKE_CXX_FLAGS} -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice")
+if(SYCL_IMPL STREQUAL "AdaptiveCpp")
+	find_package(AdaptiveCpp REQUIRED)
+elseif(SYCL_IMPL STREQUAL "dpcpp")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")
+
+	if(DPCPP_WITH_CUDA_BACKEND)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+
+		set(CUDA_ARCH "" CACHE STRING "CUDA device architecture e.g. sm_70")
+
+		if(NOT CUDA_ARCH STREQUAL "")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --cuda-gpu-arch=${CUDA_ARCH}")
+		endif()
+	endif()
+
+	if(DPCPP_WITH_ROCM_BACKEND)
+		set(ROCM_ARCH "" CACHE STRING "ROCm device architecture e.g. gfx908")
+
+		if(NOT ROCM_ARCH STREQUAL "")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amd_gpu_${ROCM_ARCH}")
+		endif()
+	endif()
+
+	if(DPCPP_WITH_LZ_BACKEND)
+		set(LZ_ARCH "" CACHE STRING "Level Zero device architecture e.g. acm-g10")
+
+		if(NOT LZ_ARCH STREQUAL "")
+		  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=${LZ_ARCH}")
+		endif()
+	endif()
+
 elseif(SYCL_IMPL STREQUAL "triSYCL")
-  find_package(TriSYCL MODULE REQUIRED)
+	find_package(TriSYCL MODULE REQUIRED)
 endif()
 
+# Check if SYCL implementation implements the required SYCL features
+include(HasFeatures)
+
 set(benchmarks
   micro/arith.cpp
   micro/DRAM.cpp
@@ -92,42 +123,56 @@ set(benchmarks
   polybench/mvt.cpp
   polybench/syr2k.cpp
   polybench/syrk.cpp
-  #compiletime/compiletime.cpp
-)
 
-foreach(benchmark IN LISTS benchmarks)
-  get_filename_component(target ${benchmark} NAME_WE)
+  # compiletime/compiletime.cpp
+  sycl2020/atomics/atomic_reduction.cpp
+  sycl2020/USM/usm_accessors_latency.cpp
+  sycl2020/USM/usm_instr_mix.cpp
+  sycl2020/USM/usm_pinned_overhead.cpp
+  sycl2020/USM/usm_allocation_latency.cpp
+)
+# Selectively add benchmarks based on some SYCL 2020 features
+if (SYCL_BENCH_HAS_SPEC_CONSTANTS)
+  list(APPEND benchmarks sycl2020/spec_constants/spec_constant_convolution.cpp)
+endif()
+if (SYCL_BENCH_HAS_KERNEL_REDUCTIONS)
+  list(APPEND benchmarks sycl2020/kernel_reduction/kernel_reduction.cpp)
+endif()
+if (SYCL_BENCH_HAS_GROUP_ALGORITHMS)
+  list(APPEND benchmarks sycl2020/group_algorithms/reduce_over_group.cpp)
+endif()
 
-  add_executable(${target} ${benchmark})
+# Setting variables
+add_compile_definitions(SYCL_BENCH_HAS_FP64_SUPPORT=$<BOOL:${SYCL_BENCH_HAS_FP64_SUPPORT}>)
 
-  if(SYCL_IMPL STREQUAL "ComputeCpp" OR SYCL_IMPL STREQUAL "hipSYCL")
-    add_sycl_to_target(TARGET ${target} SOURCES ${benchmark})
-  endif()
+foreach(benchmark IN LISTS benchmarks)
+	get_filename_component(target ${benchmark} NAME_WE)
 
-  if(SYCL_IMPL STREQUAL "ComputeCpp" AND COMPUTECPP_BITCODE STREQUAL "ptx64")
-    target_compile_definitions(${target} PRIVATE SYCL_BENCH_ENABLE_QUEUE_PROFILING)
-  endif()
+	add_executable(${target} ${benchmark})
 
-  if(SYCL_IMPL STREQUAL "LLVM")
-    target_compile_definitions(${target} PRIVATE __LLVM_SYCL__)
-  endif()
+	if(SYCL_IMPL STREQUAL "AdaptiveCpp")
+		add_sycl_to_target(TARGET ${target} SOURCES ${benchmark})
+	endif()
 
-  if(SYCL_IMPL STREQUAL "LLVM-CUDA")
-    target_compile_definitions(${target} PRIVATE __LLVM_SYCL_CUDA__)
-  endif()
+	if(SYCL_IMPL STREQUAL "dpcpp")
+		target_compile_definitions(${target} PRIVATE __DPCPP__)
+	endif()
 
   if(SYCL_IMPL STREQUAL "triSYCL")
     add_sycl_to_target(${target})
     target_compile_definitions(${target} PRIVATE __TRISYCL__)
   endif()
-
+  
+  if(ENABLE_TIME_EVENT_PROFILING)
+    target_compile_definitions(${target} PUBLIC SYCL_BENCH_ENABLE_QUEUE_PROFILING=1)
+  endif()
+  
   install(TARGETS ${target} RUNTIME DESTINATION bin/benchmarks/)
   get_filename_component(dir ${benchmark} DIRECTORY)
   set_property(TARGET ${target} PROPERTY FOLDER ${dir})
 endforeach(benchmark)
 
 # The "compiletime" target should only be used in the context of the compile time evaluation script
-#set_target_properties(compiletime PROPERTIES EXCLUDE_FROM_ALL 1)
-
+# set_target_properties(compiletime PROPERTIES EXCLUDE_FROM_ALL 1)
 install(PROGRAMS bin/run-suite DESTINATION bin/)
-install(FILES ${PROJECT_SOURCE_DIR}/Brommy.bmp DESTINATION share/)
+install(FILES ${PROJECT_SOURCE_DIR}/share/Brommy.bmp DESTINATION share/)
\ No newline at end of file
diff --git a/README.md b/README.md
index 9aa6b758..be957022 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ $ mkdir build && cd build
 
 Compile with CMake
 ```
-$ cmake -DSYCL_IMPL=[target SYCL implementation] [other compiler arguments] ..
+$ cmake -DSYCL_IMPL=[target SYCL implementation] [-DSYCL_BENCH_HAS_FP64_SUPPORT=ON|OFF] [other compiler arguments] ..
 $ cmake --build .
 $ sudo make install
 ```
@@ -57,6 +57,24 @@ Packages built via the `package` target will contain all files contained in a SY
 
 ## Attribution
 If you use SYCL-Bench, please cite the following papers:
+```
+@inproceedings{SYCL-Bench:IWOCL:2024,
+author = {Crisci, Luigi and Carpentieri, Lorenzo and Thoman, Peter and Alpay, Aksel and Heuveline, Vincent and Cosenza, Biagio},
+title = {SYCL-Bench 2020: Benchmarking SYCL 2020 on AMD, Intel, and NVIDIA GPUs},
+year = {2024},
+isbn = {9798400717901},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3648115.3648120},
+doi = {10.1145/3648115.3648120},
+booktitle = {Proceedings of the 12th International Workshop on OpenCL and SYCL},
+articleno = {1},
+numpages = {12},
+keywords = {GPU, HPC, SYCL, benchmark, heterogeneous computing, portability},
+location = {Chicago, IL, USA},
+series = {IWOCL '24}
+}
+```
 
 ```
 @inproceedings{SYCL-Bench:Euro-Par:2020,
diff --git a/bin/run-suite b/bin/run-suite
index ea8c44ab..b8a75715 100755
--- a/bin/run-suite
+++ b/bin/run-suite
@@ -108,9 +108,36 @@ default_profile = {
     },
     'mvt' : {
       '--size' : create_log_range(2**14, 2**14)
-    },                                                                             
-  },
-  'individual-benchmark-flags' : set([])
+    },              
+    'usm_accessors_latency' : {
+      '--size' : create_log_range(2**20, 2**20)
+    },
+    'usm_allocation_latency' : {
+      '--size' : create_log_range(2**25, 2**25)
+    },
+    'usm_instr_mix' : {
+      '--size' : create_log_range(2**14, 2**14)
+    },
+    'usm_pinned_overhead' : {
+      '--size' : create_log_range(2**20, 2**20)
+    },
+    'spec_constant_convolution' : {
+      '--size' : create_log_range(2**11, 2**11)
+    },
+    'atomic_reduction' : {
+      '--size' : create_log_range(2**20, 2**20)
+    },
+    'reduce_over_group' : {
+      '--size' : create_log_range(2**20, 2**20)
+    },
+    'kernel_reduction' : {
+      '--size' : create_log_range(2**20, 2**20)
+    }
+    },
+  'individual-benchmark-flags' : {
+    'usm_instr_mix' : ['--instr-mix=6'],
+    'usm_pinned_overhead' : ['--num-copies=5'],
+  }
 }
 
 def construct_profile(overridden_options_dict,
@@ -222,8 +249,9 @@ if __name__ == '__main__':
         if benchmark_name in individual_benchmark_options:
           for param in individual_benchmark_options[benchmark_name]:
             options[param] = individual_benchmark_options[benchmark_name][param]
+            
         if benchmark_name in individual_benchmark_flags:
-          for f in individual_benchmark_flags:
+          for f in individual_benchmark_flags[benchmark_name]:
             flags.add(f)
         
         max_runtime = 0.0
diff --git a/cmake/HasFeatures.cmake b/cmake/HasFeatures.cmake
new file mode 100644
index 00000000..8ca8bd6c
--- /dev/null
+++ b/cmake/HasFeatures.cmake
@@ -0,0 +1,22 @@
+macro(check_feature VAR FILENAME)
+    if(NOT DEFINED RUN_RES_${VAR})
+            try_run(RUN_RES_${VAR} COMPILE_RES_${VAR} ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/has-features/src/${FILENAME} 
+                CMAKE_FLAGS ${CMAKE_CXX_FLAGS}
+                COMPILE_OUTPUT_VARIABLE OUTPUT_VAR
+                RUN_OUTPUT_VARIABLE RUN_VAR 
+            )
+    endif()
+    # Publish the result as SYCL_BENCH_HAS_<VAR>; a value passed via -D takes precedence (no FORCE).
+    set(RES OFF)
+    if (COMPILE_RES_${VAR} AND RUN_RES_${VAR} EQUAL 0)
+        set(RES ON)
+    endif()
+    set(SYCL_BENCH_HAS_${VAR} ${RES} CACHE BOOL "SYCL feature check result for ${VAR}")
+    message(STATUS "${VAR}: ${SYCL_BENCH_HAS_${VAR}}")
+endmacro()
+
+message(STATUS "Checking for SYCL features....")
+check_feature(KERNEL_REDUCTIONS kernel_reduction_dummy.cpp)
+check_feature(SPEC_CONSTANTS spec_constants_dummy.cpp)
+check_feature(GROUP_ALGORITHMS group_algorithms_dummy.cpp)
+check_feature(FP64_SUPPORT fp64_support_dummy.cpp)
\ No newline at end of file
diff --git a/cmake/has-features/src/fp64_support_dummy.cpp b/cmake/has-features/src/fp64_support_dummy.cpp
new file mode 100644
index 00000000..525df029
--- /dev/null
+++ b/cmake/has-features/src/fp64_support_dummy.cpp
@@ -0,0 +1,15 @@
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue q;
+  sycl::buffer<double> x(1);
+
+  q.submit([&](sycl::handler& cgh) {
+    sycl::accessor a(x, cgh, sycl::read_write);
+    cgh.parallel_for<class dummy>(sycl::range<1>(1), [=](sycl::id<1> idx) { a[idx] = 0; });
+  });
+
+  sycl::host_accessor host{x};
+  assert(host[0] == 0);
+
+}
\ No newline at end of file
diff --git a/cmake/has-features/src/group_algorithms_dummy.cpp b/cmake/has-features/src/group_algorithms_dummy.cpp
new file mode 100644
index 00000000..1e8e5308
--- /dev/null
+++ b/cmake/has-features/src/group_algorithms_dummy.cpp
@@ -0,0 +1,17 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+
+
+int main() {
+  sycl::queue q;
+  int* i = sycl::malloc_shared<int>(1, q);
+  q.submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::nd_range<1>{{1}, {1}}, [=](sycl::nd_item<1> item) {
+      // call only the group algorithms used in SYCL-Bench
+      *i = sycl::reduce_over_group(item.get_group(), 1, sycl::plus<int>{});
+    });
+  }).wait();
+
+  assert(*i == 1);
+  sycl::free(i, q);
+}
\ No newline at end of file
diff --git a/cmake/has-features/src/kernel_reduction_dummy.cpp b/cmake/has-features/src/kernel_reduction_dummy.cpp
new file mode 100644
index 00000000..1cce2205
--- /dev/null
+++ b/cmake/has-features/src/kernel_reduction_dummy.cpp
@@ -0,0 +1,18 @@
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue q;
+  const int init = 0; // a reduction combines into the variable's prior value, so start from a defined 0
+  sycl::buffer<int> x(&init, sycl::range<1>(1));
+  q.submit([&](sycl::handler& cgh) {
+#ifdef __ACPP__
+    auto r = sycl::reduction(x.template get_access<sycl::access_mode::read_write>(cgh), sycl::plus<int>{});
+#else
+    auto r = sycl::reduction(x, cgh, sycl::plus<int>{});
+#endif
+    cgh.parallel_for(sycl::range<1>{5}, r, [=](sycl::id<1> idx, auto& op) { op.combine(1); });
+  }).wait();
+
+  sycl::host_accessor host{x};
+  assert(host[0] == 5);
+}
\ No newline at end of file
diff --git a/cmake/has-features/src/spec_constants_dummy.cpp b/cmake/has-features/src/spec_constants_dummy.cpp
new file mode 100644
index 00000000..36942366
--- /dev/null
+++ b/cmake/has-features/src/spec_constants_dummy.cpp
@@ -0,0 +1,38 @@
+#include <sycl/sycl.hpp>
+
+#ifndef __ACPP__
+
+static constexpr sycl::specialization_id<int> x;
+
+int main() {
+  sycl::queue q;
+  int* i = sycl::malloc_shared<int>(1, q);
+  q.submit([&](sycl::handler& cgh) { 
+    cgh.set_specialization_constant<x>(5); 
+    cgh.parallel_for(sycl::range(1), [=](sycl::item<1> item, sycl::kernel_handler h) {
+      *i = h.get_specialization_constant<x>();
+    });
+   }).wait();
+
+  assert(*i == 5);
+  sycl::free(i, q);
+}
+
+#else
+
+// AdaptiveCpp implements sycl::specialized instead of spec constants
+
+int main() { 
+  sycl::queue q;
+  sycl::specialized<int> x;
+  x = 5; //Requires copy assignment operator
+  int* i = sycl::malloc_shared<int>(1, q);
+  q.parallel_for(sycl::range(1), [=](sycl::id<1> idx) {
+    *i = x;
+  }).wait();
+
+  assert(*i == 5);
+  sycl::free(i, q);
+}
+
+#endif
\ No newline at end of file
diff --git a/compiletime/compiletime.cpp b/compiletime/compiletime.cpp
index d57ab23f..ef8f32e2 100644
--- a/compiletime/compiletime.cpp
+++ b/compiletime/compiletime.cpp
@@ -1,14 +1,14 @@
 // Skeleton for compile time measurements -- doesn't do anything on its own, but should compile successfully
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 #include <kernel_declarations.inc>
 
 void run(size_t rt_size) {
-  cl::sycl::queue device_queue;
-  #include <kernels.inc>
+  sycl::queue device_queue;
+#include <kernels.inc>
   device_queue.wait_and_throw();
 }
 
diff --git a/compiletime/compiletime_gen.rb b/compiletime/compiletime_gen.rb
index e700ee10..e34c3d95 100644
--- a/compiletime/compiletime_gen.rb
+++ b/compiletime/compiletime_gen.rb
@@ -11,9 +11,9 @@
 # operations available for generation
 
 OP_MAPPING = {
-        "sin" => "OUT = cl::sycl::sin(IN1);",
-        "cos" => "OUT = cl::sycl::cos(IN1);",
-        "sqrt" => "OUT = cl::sycl::sqrt(IN1);",
+        "sin" => "OUT = sycl::sin(IN1);",
+        "cos" => "OUT = sycl::cos(IN1);",
+        "sqrt" => "OUT = sycl::sqrt(IN1);",
         "add" => "OUT = IN1 + IN2;",
         "mad" => "OUT = IN1 * IN2 + IN1;",
 }
@@ -129,17 +129,17 @@ def parse_cmd(args)
         f.puts
 
         kernel_names.each do |kn|
-                fwr.call "device_queue.submit([&](cl::sycl::handler& cgh) {"
+                fwr.call "device_queue.submit([&](sycl::handler& cgh) {"
 
                 buffers.each do |bn, an|
                         fwr.call "auto #{an} = #{bn}.get_access<s::access::mode::read_write>(cgh);"
                 end
 
-                fwr.call "cl::sycl::range<#{options.dimensions}> ndrange{#{ndrange}};"
+                fwr.call "sycl::range<#{options.dimensions}> ndrange{#{ndrange}};"
 
                 full_kernel_name = kn
                 full_kernel_name += "<#{options.type}, #{otions.dimensions}>" if options.templated
-                fwr.call "cgh.parallel_for<#{full_kernel_name}>(ndrange, [=](cl::sycl::id<#{options.dimensions}> gid) {"
+                fwr.call "cgh.parallel_for<#{full_kernel_name}>(ndrange, [=](sycl::id<#{options.dimensions}> gid) {"
                 fwr.call "#{acc_names[0]}[gid] += #{capture_names.join(" + ")};" # use each capture
                 fwr.call "#{acc_names[0]}[gid] += #{acc_names.join("[gid] + ")}[gid];" # use each buffer
 
diff --git a/include/benchmark_hook.h b/include/benchmark_hook.h
index 22db140b..fefa1008 100644
--- a/include/benchmark_hook.h
+++ b/include/benchmark_hook.h
@@ -3,17 +3,16 @@
 
 #include "result_consumer.h"
 
-class BenchmarkHook
-{
+class BenchmarkHook {
 public:
   virtual void atInit() = 0;
   virtual void preSetup() = 0;
-  virtual void postSetup()= 0;
+  virtual void postSetup() = 0;
   virtual void preKernel() = 0;
   virtual void postKernel() = 0;
   virtual void emitResults(ResultConsumer&) {}
 
-  virtual ~BenchmarkHook(){}
+  virtual ~BenchmarkHook() {}
 };
 
 #endif
diff --git a/include/benchmark_traits.h b/include/benchmark_traits.h
index 7b4b935d..2822b677 100644
--- a/include/benchmark_traits.h
+++ b/include/benchmark_traits.h
@@ -2,7 +2,7 @@
 
 #include <utility>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 namespace detail {
 
@@ -13,7 +13,7 @@ struct SupportsQueueProfiling {
 
 template <typename T>
 struct SupportsQueueProfiling<T,
-    std::void_t<decltype(std::declval<T>().run(std::declval<std::vector<cl::sycl::event>&>()))>> {
+    std::void_t<decltype(std::declval<T>().run(std::declval<std::vector<sycl::event>&>()))>> {
   static constexpr bool value = true;
 };
 
diff --git a/include/bitmap.h b/include/bitmap.h
index ff094d5d..02d282af 100644
--- a/include/bitmap.h
+++ b/include/bitmap.h
@@ -1,96 +1,93 @@
 #ifndef BITMAP_H
 #define BITMAP_H
 
-#include <CL/sycl.hpp> // float4 definition
 #include <string>
+#include <sycl/sycl.hpp> // float4 definition
 #include <vector>
 
 using std::string;
 
 
-void load_bitmap_mirrored(string filename, int size, std::vector<cl::sycl::float4> &pixels);
-void save_bitmap(string filename, int size, const std::vector<cl::sycl::float4> &buffer);
+void load_bitmap_mirrored(string filename, int size, std::vector<sycl::float4>& pixels);
+void save_bitmap(string filename, int size, const std::vector<sycl::float4>& buffer);
 
 /**
   A single Pixel in the image. A Pixel has red, green, and blue
   integer components in the range from 0 to 255.
 **/
-class Pixel
-{
+class Pixel {
 public:
   int r, g, b;
 
   // Initializes a Pixel with a default black color.
-  Pixel() : r(0), g(0), b(0) { }
+  Pixel() : r(0), g(0), b(0) {}
 
   // Initializes a color Pixel with the specified RGB values.
-  Pixel(int _r, int _g, int _b) : r(_r), g(_g), b(_b) { }
+  Pixel(int _r, int _g, int _b) : r(_r), g(_g), b(_b) {}
 };
 
-//To abbreviate a pixel matrix built as a vector of vectors
-typedef std::vector < std::vector <Pixel> > PixelMatrix;
+// To abbreviate a pixel matrix built as a vector of vectors
+typedef std::vector<std::vector<Pixel>> PixelMatrix;
 
 /**
  * Represents a bitmap with pixels in row-major order.
  * Limitations: Windows BMP, no compression, 24 bit color depth.
-**/
-class Bitmap
-{
-  private:
-    PixelMatrix pixels;
-
-  public:
-    /**
-     * Opens a file as its name is provided and reads pixel-by-pixel the colors
-     * into a matrix of RGB pixels. Any errors will cout but will result in an
-     * empty matrix (with no rows and no columns).
-     *
-     * @param name of the filename to be opened and read as a matrix of pixels
-    **/
-    void open(std::string);
-
-    /**
-     * Saves the current image, represented by the matrix of pixels, as a
-     * Windows BMP file with the name provided by the parameter. File extension
-     * is not forced but should be .bmp. Any errors will cout and will NOT 
-     * attempt to save the file.
-     *
-     * @param name of the filename to be written as a bmp image
-    **/
-    void save(std::string);
-
-    /**
-     * Validates whether or not the current matrix of pixels represents a
-     * proper image with non-zero-size rows and consistent non-zero-size
-     * columns for each row. In addition, each pixel in the matrix is validated
-     * to have red, green, and blue components with values between 0 and 255
-     *
-     * @return boolean value of whether or not the matrix is a valid image
-    **/
-    bool isImage();
-
-    /**
-     * Provides a vector of vector of pixels representing the bitmap
-     *
-     * @return the bitmap image, represented by a matrix of RGB pixels
-    **/
-    PixelMatrix toPixelMatrix();
-
-    /**
-     * Overwrites the current bitmap with that represented by a matrix of
-     * pixels. Does not validate that the new matrix of pixels is a proper
-     * image.
-     *
-     * @param a matrix of pixels to represent a bitmap
-    **/
-    void fromPixelMatrix(const PixelMatrix &);
-    
+ **/
+class Bitmap {
+private:
+  PixelMatrix pixels;
+
+public:
+  /**
+   * Opens a file as its name is provided and reads pixel-by-pixel the colors
+   * into a matrix of RGB pixels. Any errors will cout but will result in an
+   * empty matrix (with no rows and no columns).
+   *
+   * @param name of the filename to be opened and read as a matrix of pixels
+   **/
+  void open(std::string);
+
+  /**
+   * Saves the current image, represented by the matrix of pixels, as a
+   * Windows BMP file with the name provided by the parameter. File extension
+   * is not forced but should be .bmp. Any errors will cout and will NOT
+   * attempt to save the file.
+   *
+   * @param name of the filename to be written as a bmp image
+   **/
+  void save(std::string);
+
+  /**
+   * Validates whether or not the current matrix of pixels represents a
+   * proper image with non-zero-size rows and consistent non-zero-size
+   * columns for each row. In addition, each pixel in the matrix is validated
+   * to have red, green, and blue components with values between 0 and 255
+   *
+   * @return boolean value of whether or not the matrix is a valid image
+   **/
+  bool isImage();
+
+  /**
+   * Provides a vector of vector of pixels representing the bitmap
+   *
+   * @return the bitmap image, represented by a matrix of RGB pixels
+   **/
+  PixelMatrix toPixelMatrix();
+
+  /**
+   * Overwrites the current bitmap with that represented by a matrix of
+   * pixels. Does not validate that the new matrix of pixels is a proper
+   * image.
+   *
+   * @param a matrix of pixels to represent a bitmap
+   **/
+  void fromPixelMatrix(const PixelMatrix&);
 };
 
 //////////////////////////////////////////////////////////////////////////////////
-#include <iostream>
 #include <fstream>
-//#include "bitmap.h"
+#include <iostream>
+// #include "bitmap.h"
 #include <cstdlib>
 
 typedef unsigned char uchar_t;
@@ -99,27 +96,24 @@ typedef unsigned short int uint16_t;
 typedef signed int int32_t;
 typedef signed short int int16_t;
 
-const int MIN_RGB=0;
-const int MAX_RGB=255;
-const int BMP_MAGIC_ID=2;
+const int MIN_RGB = 0;
+const int MAX_RGB = 255;
+const int BMP_MAGIC_ID = 2;
 
 // --------------------------------------------------------------
 // Windows BMP-specific format data
-struct bmpfile_magic
-{
-	uchar_t magic[BMP_MAGIC_ID];
+struct bmpfile_magic {
+  uchar_t magic[BMP_MAGIC_ID];
 };
 
-struct bmpfile_header
-{
-	uint32_t file_size;
-	uint16_t creator1;
-	uint16_t creator2;
-	uint32_t bmp_offset;
+struct bmpfile_header {
+  uint32_t file_size;
+  uint16_t creator1;
+  uint16_t creator2;
+  uint32_t bmp_offset;
 };
 
-struct bmpfile_dib_info
-{
+struct bmpfile_dib_info {
   uint32_t header_size;
   int32_t width;
   int32_t height;
@@ -141,213 +135,180 @@ struct bmpfile_dib_info
  * empty matrix (with no rows and no columns).
  *
  * @param name of the filename to be opened and read as a matrix of pixels
-**/
-void Bitmap::open(std::string filename)
-{
-	std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary);
-        //clear data if already holds information
-        for(int i=0; i<pixels.size(); i++)
-        {
-            pixels[i].clear();
+ **/
+void Bitmap::open(std::string filename) {
+  std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary);
+  // clear data if already holds information
+  for(int i = 0; i < pixels.size(); i++) {
+    pixels[i].clear();
+  }
+  pixels.clear();
+
+  if(file.fail()) {
+    std::cout << filename << " could not be opened. Does it exist? "
+              << "Is it already open by another program?\n";
+    // pixels.resize(0); //make empty if it isn't already
+  } else {
+    bmpfile_magic magic;
+    file.read((char*)(&magic), sizeof(magic));
+
+    // Check to make sure that the first two bytes of the file are the "BM"
+    // identifier that identifies a bitmap image.
+    if(magic.magic[0] != 'B' || magic.magic[1] != 'M') {
+      std::cout << filename << " is not in proper BMP format.\n";
+    } else {
+      bmpfile_header header;
+      file.read((char*)(&header), sizeof(header));
+
+      bmpfile_dib_info dib_info;
+      file.read((char*)(&dib_info), sizeof(dib_info));
+
+      // Check for this here and so that we know later whether we need to insert
+      // each row at the bottom or top of the image.
+      bool flip = true;
+      if(dib_info.height < 0) {
+        flip = false;
+        dib_info.height = -dib_info.height;
+      }
+
+      // Only support for 24-bit images
+      if(dib_info.bits_per_pixel != 24) {
+        std::cout << filename << " uses " << dib_info.bits_per_pixel
+                  << "bits per pixel (bit depth). Bitmap only supports 24bit.\n";
+      }
+
+      // No support for compressed images
+      if(dib_info.compression != 0) {
+        std::cout << filename << " is compressed. "
+                  << "Bitmap only supports uncompressed images.\n";
+      }
+
+      file.seekg(header.bmp_offset);
+
+      // Read the pixels for each row and column of Pixels in the image.
+      for(int row = 0; row < dib_info.height; row++) {
+        std::vector<Pixel> row_data;
+
+        for(int col = 0; col < dib_info.width; col++) {
+          int blue = file.get();
+          int green = file.get();
+          int red = file.get();
+
+          row_data.push_back(Pixel(red, green, blue));
         }
-        pixels.clear();
-
-	if (file.fail())
-	{
-		std::cout<<filename<<" could not be opened. Does it exist? "
-		         <<"Is it already open by another program?\n";
-		//pixels.resize(0); //make empty if it isn't already
-	}
-	else
-	{
-		bmpfile_magic magic;
-		file.read((char*)(&magic), sizeof(magic));
-		
-		// Check to make sure that the first two bytes of the file are the "BM"
-		// identifier that identifies a bitmap image.
-		if (magic.magic[0] != 'B' || magic.magic[1] != 'M')
-		{
-			std::cout<<filename<<" is not in proper BMP format.\n";
-		}
-		else
-		{
-			bmpfile_header header;
-			file.read((char*)(&header), sizeof(header));
-
-			bmpfile_dib_info dib_info;
-			file.read((char*)(&dib_info), sizeof(dib_info));
-
-			// Check for this here and so that we know later whether we need to insert
-			// each row at the bottom or top of the image.
-			bool flip = true;
-			if (dib_info.height < 0)
-			{
-				flip = false;
-				dib_info.height = -dib_info.height;
-			}
-
-			// Only support for 24-bit images
-			if (dib_info.bits_per_pixel != 24)
-			{
-				std::cout<<filename<<" uses "<<dib_info.bits_per_pixel
-				         <<"bits per pixel (bit depth). Bitmap only supports 24bit.\n";
-			}
-
-			// No support for compressed images
-			if (dib_info.compression != 0)
-			{
-				std::cout<<filename<<" is compressed. "
-				         <<"Bitmap only supports uncompressed images.\n";
-			}
-
-			file.seekg(header.bmp_offset);
-
-			// Read the pixels for each row and column of Pixels in the image.
-			for (int row = 0; row < dib_info.height; row++)
-			{
-				std::vector <Pixel> row_data;
-
-				for (int col = 0; col < dib_info.width; col++)
-				{
-					int blue = file.get();
-					int green = file.get();
-					int red = file.get();
-
-					row_data.push_back( Pixel(red, green, blue) );
-				}
-
-				// Rows are padded so that they're always a multiple of 4
-				// bytes. This line skips the padding at the end of each row.
-				file.seekg(dib_info.width % 4, std::ios::cur);
-
-				if (flip)
-				{
-					pixels.insert(pixels.begin(), row_data);
-				}
-				else
-				{
-					pixels.push_back(row_data);
-				}
-			}
-
-			file.close();
-		}//end else (is an image)
-	}//end else (can open file)
+
+        // Rows are padded so that they're always a multiple of 4
+        // bytes. This line skips the padding at the end of each row.
+        file.seekg(dib_info.width % 4, std::ios::cur);
+
+        if(flip) {
+          pixels.insert(pixels.begin(), row_data);
+        } else {
+          pixels.push_back(row_data);
+        }
+      }
+
+      file.close();
+    } // end else (is an image)
+  }   // end else (can open file)
 }
 
 // ----------------------------------------------------------------------------
 /**
  * Saves the current image, represented by the matrix of pixels, as a
  * Windows BMP file with the name provided by the parameter. File extension
- * is not forced but should be .bmp. Any errors will cout and will NOT 
+ * is not forced but should be .bmp. Any errors will cout and will NOT
  * attempt to save the file.
  *
  * @param name of the filename to be written as a bmp image
-**/
-void Bitmap::save(std::string filename)
-{
-	std::ofstream file(filename.c_str(), std::ios::out | std::ios::binary);
-
-	if (file.fail())
-	{
-		std::cout<<filename<<" could not be opened for editing. "
-		         <<"Is it already open by another program or is it read-only?\n";
-		
-	}
-	else if( !isImage() )
-	{
-		std::cout<<"Bitmap cannot be saved. It is not a valid image.\n";
-	}
-	else
-	{
-		// Write all the header information that the BMP file format requires.
-		bmpfile_magic magic;
-		magic.magic[0] = 'B';
-		magic.magic[1] = 'M';
-		file.write((char*)(&magic), sizeof(magic));
-		bmpfile_header header = { 0 };
-		header.bmp_offset = sizeof(bmpfile_magic)
-				+ sizeof(bmpfile_header) + sizeof(bmpfile_dib_info);
-		header.file_size = header.bmp_offset
-				+ (pixels.size() * 3 + pixels[0].size() % 4) * pixels.size();
-		file.write((char*)(&header), sizeof(header));
-		bmpfile_dib_info dib_info = { 0 };
-		dib_info.header_size = sizeof(bmpfile_dib_info);
-		dib_info.width = pixels[0].size();
-		dib_info.height = pixels.size();
-		dib_info.num_planes = 1;
-		dib_info.bits_per_pixel = 24;
-		dib_info.compression = 0;
-		dib_info.bmp_byte_size = 0;
-		dib_info.hres = 2835;
-		dib_info.vres = 2835;
-		dib_info.num_colors = 0;
-		dib_info.num_important_colors = 0;
-		file.write((char*)(&dib_info), sizeof(dib_info));
-
-		// Write each row and column of Pixels into the image file -- we write
-		// the rows upside-down to satisfy the easiest BMP format.
-		for (int row = pixels.size() - 1; row >= 0; row--)
-		{
-			const std::vector <Pixel> & row_data = pixels[row];
-
-			for (int col = 0; col < row_data.size(); col++)
-			{
-				const Pixel& pix = row_data[col];
-
-				file.put((uchar_t)(pix.b));
-				file.put((uchar_t)(pix.g));
-				file.put((uchar_t)(pix.r));
-			}
-
-			// Rows are padded so that they're always a multiple of 4
-			// bytes. This line skips the padding at the end of each row.
-			for (int i = 0; i < row_data.size() % 4; i++)
-			{
-				file.put(0);
-			}
-		}
-
-		file.close();
-	}
+ **/
+void Bitmap::save(std::string filename) {
+  std::ofstream file(filename.c_str(), std::ios::out | std::ios::binary);
+
+  if(file.fail()) {
+    std::cout << filename << " could not be opened for editing. "
+              << "Is it already open by another program or is it read-only?\n";
+  } else if(!isImage()) {
+    std::cout << "Bitmap cannot be saved. It is not a valid image.\n";
+  } else {
+    // Write all the header information that the BMP file format requires.
+    bmpfile_magic magic;
+    magic.magic[0] = 'B';
+    magic.magic[1] = 'M';
+    file.write((char*)(&magic), sizeof(magic));
+    bmpfile_header header = {0};
+    header.bmp_offset = sizeof(bmpfile_magic) + sizeof(bmpfile_header) + sizeof(bmpfile_dib_info);
+    // Each stored row is width * 3 colour bytes plus width % 4 padding bytes; there are pixels.size() rows.
+    header.file_size = header.bmp_offset + (pixels[0].size() * 3 + pixels[0].size() % 4) * pixels.size();
+    file.write((char*)(&header), sizeof(header));
+    bmpfile_dib_info dib_info = {0};
+    dib_info.header_size = sizeof(bmpfile_dib_info);
+    dib_info.width = pixels[0].size();
+    dib_info.height = pixels.size();
+    dib_info.num_planes = 1;
+    dib_info.bits_per_pixel = 24;
+    dib_info.compression = 0;
+    dib_info.bmp_byte_size = 0;
+    dib_info.hres = 2835;
+    dib_info.vres = 2835;
+    dib_info.num_colors = 0;
+    dib_info.num_important_colors = 0;
+    file.write((char*)(&dib_info), sizeof(dib_info));
+
+    // Write each row and column of Pixels into the image file -- we write
+    // the rows upside-down to satisfy the easiest BMP format.
+    for(int row = pixels.size() - 1; row >= 0; row--) {
+      const std::vector<Pixel>& row_data = pixels[row];
+
+      for(int col = 0; col < row_data.size(); col++) {
+        const Pixel& pix = row_data[col];
+
+        // Channels are stored in blue, green, red order.
+        file.put((uchar_t)(pix.b));
+        file.put((uchar_t)(pix.g));
+        file.put((uchar_t)(pix.r));
+      }
+
+      // Rows are padded so that they're always a multiple of 4 bytes.
+      // This loop writes the zero padding bytes at the end of each row.
+      for(int i = 0; i < row_data.size() % 4; i++) {
+        file.put(0);
+      }
+    }
+
+    file.close();
+  }
 }
-	
+
 // ----------------------------------------------------------------------------
 /**
-  * Validates whether or not the current matrix of pixels represents a
-  * proper image with non-zero-size rows and consistent non-zero-size
-  * columns for each row. In addition, each pixel in the matrix is validated
-  * to have red, green, and blue components with values between 0 and 255
-  *
-  * @return boolean value of whether or not the matrix is a valid image
+ * Validates whether or not the current matrix of pixels represents a
+ * proper image with non-zero-size rows and consistent non-zero-size
+ * columns for each row. In addition, each pixel in the matrix is validated
+ * to have red, green, and blue components with values between 0 and 255
+ *
+ * @return boolean value of whether or not the matrix is a valid image
  **/
-bool Bitmap::isImage()
-{
-	const int height = pixels.size();
-
-	if( height == 0 || pixels[0].size() == 0)
-	{
-		return false;
-	}
-
-	const int width = pixels[0].size();
-
-	for(int row=0; row < height; row++)
-	{
-		if( pixels[row].size() != width )
-		{
-			return false;
-		}
-		for(int column=0; column < width; column++)
-		{
-			Pixel current = pixels[row][column];
-			if( current.r > MAX_RGB || current.r < MIN_RGB ||
-			    current.g > MAX_RGB || current.g < MIN_RGB ||
-			    current.b > MAX_RGB || current.b < MIN_RGB )
-				return false;
-		}
-	}
-	return true;
+bool Bitmap::isImage() {
+  const int rows = pixels.size();
+  if(rows == 0 || pixels[0].size() == 0) { // need at least one row and one column
+    return false;
+  }
+  const int cols = pixels[0].size();
+
+  for(int r = 0; r < rows; ++r) {
+    if(pixels[r].size() != cols) { // every row must be as wide as the first
+      return false;
+    }
+    for(int c = 0; c < cols; ++c) {
+      const Pixel px = pixels[r][c];
+      const bool in_range = px.r >= MIN_RGB && px.r <= MAX_RGB && px.g >= MIN_RGB && px.g <= MAX_RGB &&
+                            px.b >= MIN_RGB && px.b <= MAX_RGB;
+      if(!in_range) {
+        return false;
+      }
+    }
+  }
+  return true;
 }
 
 // ----------------------------------------------------------------------------
@@ -355,17 +316,13 @@ bool Bitmap::isImage()
  * Provides a vector of vector of pixels representing the bitmap
  *
  * @return the bitmap image, represented by a matrix of RGB pixels
-**/
-PixelMatrix Bitmap::toPixelMatrix()
-{
-	if( isImage() )
-	{
-		return pixels;
-	}	
-	else
-	{
-		return PixelMatrix();
-	}	
+ **/
+PixelMatrix Bitmap::toPixelMatrix() {
+  // A matrix that fails isImage() is handed back as an empty matrix.
+  if(!isImage()) {
+    return PixelMatrix();
+  }
+  return pixels;
 }
 
 // ----------------------------------------------------------------------------
@@ -375,60 +332,54 @@ PixelMatrix Bitmap::toPixelMatrix()
  * image.
  *
  * @param a matrix of pixels to represent a bitmap
-**/
-void Bitmap::fromPixelMatrix(const PixelMatrix & values)
-{
-	pixels = values;
-}
+ **/
+void Bitmap::fromPixelMatrix(const PixelMatrix& values) { pixels = values; }
 
 
 #endif
 
 
-void load_bitmap_mirrored(string filename, int size, std::vector<cl::sycl::float4> &input){
+void load_bitmap_mirrored(string filename, int size, std::vector<sycl::float4>& input) {
   Bitmap input_image;
   input_image.open(filename);
-  //std::cout << "input image " << filename << " loaded" << std::endl;
+  // A failed open() only prints a message and leaves the matrix empty, so w and h may both be 0 here.
   PixelMatrix pixels = input_image.toPixelMatrix();
   int w = pixels.size();
   int h;
-  if(w>0)
+  if(w > 0)
     h = pixels[0].size();
   else
     h = 0;
   // prepare the input buffer (similar to a GL_MIRRORED_REPEAT of the input picture)
   input.resize(size * size);
-  for(size_t i=0; i<size; i++)
-    for(size_t j=0; j<size; j++){
-      Pixel pixel = pixels[i%w][j%h]; // mirror repeat
-      cl::sycl::float4 color = cl::sycl::float4(pixel.r / 255.0f, pixel.g / 255.0f, pixel.b / 255.0f, 1.0f); // cast to float  
+  if(w == 0 || h == 0) // no source pixels to tile: i % w / j % h below would divide by zero
+    return;
+  for(size_t i = 0; i < size; i++)
+    for(size_t j = 0; j < size; j++) {
+      Pixel pixel = pixels[i % w][j % h];                                                            // mirror repeat
+      sycl::float4 color = sycl::float4(pixel.r / 255.0f, pixel.g / 255.0f, pixel.b / 255.0f, 1.0f); // cast to float
       input[j + i * size] = color; // write to input buffer
     }
-    //std::cout << "image resized to match the input size: ";
-    //std::cout << "[" << w << "x" << h << "] => [" << size << "x" << size << "]" << std::endl;
 }
 
 
-void save_bitmap(string filename, int size, const std::vector<cl::sycl::float4> &output){
-    // write the output picture
-    //std::cout << "saving the output picture in " << filename << std::endl;
-    Bitmap output_image;
-    PixelMatrix pixels;
-    pixels.resize(size);
-//std::cout << "debug " << size << " - " << output.size() << std::endl;
-    for(size_t i=0; i<size; i++){
-        pixels[i].resize(size);
-        for(size_t j=0; j<size; j++){
-          cl::sycl::float4 color = output[i * size + j] * 255.f;
-//std::cout << color.x() << "," << color.z() << "/";
-          pixels[i][j].r = (int) color.x();
-          pixels[i][j].g = (int) color.y();
-          pixels[i][j].b = (int) color.z();
-        }
+void save_bitmap(string filename, int size, const std::vector<sycl::float4>& output) {
+  // Builds a size x size pixel matrix from output and writes it to filename as a BMP.
+  // Each colour channel is scaled by 255 and truncated to int; the fourth component is not used.
+  Bitmap output_image;
+  PixelMatrix pixels;
+  pixels.resize(size);
+  // output is read row-major: element (i, j) lives at index i * size + j.
+  for(size_t i = 0; i < size; i++) {
+    pixels[i].resize(size);
+    for(size_t j = 0; j < size; j++) {
+      sycl::float4 color = output[i * size + j] * 255.f;
+      // Bitmap::save() refuses to write the image if any channel ends up outside [0, 255].
+      pixels[i][j].r = (int)color.x();
+      pixels[i][j].g = (int)color.y();
+      pixels[i][j].b = (int)color.z();
     }
-    output_image.fromPixelMatrix(pixels);
-    output_image.save(filename);
+  }
+  output_image.fromPixelMatrix(pixels);
+  output_image.save(filename);
 }
-
-
-
diff --git a/include/command_line.h b/include/command_line.h
index 1fb73f93..852fbfea 100644
--- a/include/command_line.h
+++ b/include/command_line.h
@@ -1,50 +1,48 @@
 #ifndef BENCHMARK_COMMAND_LINE_HPP
 #define BENCHMARK_COMMAND_LINE_HPP
 
+#include "common.h"
+
+#include "result_consumer.h"
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
 #include <string>
+#include <sycl/sycl.hpp>
 #include <unordered_map>
 #include <unordered_set>
-#include <stdexcept>
 #include <vector>
-#include <iostream>
-#include <sstream>
-#include <memory>
-#include <CL/sycl.hpp>
-#include "result_consumer.h"
 
 using CommandLineArguments = std::unordered_map<std::string, std::string>;
 using FlagList = std::unordered_set<std::string>;
 
 namespace detail {
 
-template<class T>
-inline T simple_cast(const std::string& s)
-{
+template <class T>
+inline T simple_cast(const std::string& s) {
   std::stringstream sstr{s};
   T result;
   sstr >> result;
   return result;
 }
 
-template<class T>
-inline std::vector<T> parseCommaDelimitedList(const std::string& s)
-{
+template <class T>
+inline std::vector<T> parseCommaDelimitedList(const std::string& s) {
   std::stringstream istr(s);
   std::string current;
   std::vector<T> result;
 
-  while(std::getline(istr, current, ','))
-    result.push_back(simple_cast<T>(current));
-  
+  while(std::getline(istr, current, ',')) result.push_back(simple_cast<T>(current));
+
   return result;
 }
 
-template<class SyclArraylike>
-inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultValue)
-{
+template <class SyclArraylike>
+inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultValue) {
   auto elements = parseCommaDelimitedList<std::size_t>(s);
-  if(s.size() > 3)
+  if(elements.size() > 3) // dispatch on the number of parsed components, not on the character count of s
-    throw std::invalid_argument{"Invalid sycl range/id: "+s};
+    throw std::invalid_argument{"Invalid sycl range/id: " + s};
-  else if(s.size() == 3)
+  else if(elements.size() == 3)
     return SyclArraylike{elements[0], elements[1], elements[2]};
-  else if(s.size() == 2)
+  else if(elements.size() == 2)
@@ -52,116 +50,88 @@ inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultVal
-  else if(s.size() == 1)
+  else if(elements.size() == 1) // missing trailing components are filled with defaultValue
     return SyclArraylike{elements[0], defaultValue, defaultValue};
   else
-    throw std::invalid_argument{"Invalid sycl range/id: "+s};
+    throw std::invalid_argument{"Invalid sycl range/id: " + s};
 }
 
-}
+} // namespace detail
 
-template<class T>
-inline T cast(const std::string& s)
-{
+template <class T>
+inline T cast(const std::string& s) {
   return detail::simple_cast<T>(s);
 }
 
-template<>
-inline cl::sycl::range<3>
-cast(const std::string& s)
-{
-  return detail::parseSyclArray<cl::sycl::range<3>>(s, 1);
+template <>
+inline sycl::range<3> cast(const std::string& s) {
+  return detail::parseSyclArray<sycl::range<3>>(s, 1);
 }
 
-template<>
-inline cl::sycl::id<3>
-cast(const std::string& s)
-{
-  return detail::parseSyclArray<cl::sycl::id<3>>(s, 0);
+template <>
+inline sycl::id<3> cast(const std::string& s) {
+  return detail::parseSyclArray<sycl::id<3>>(s, 0);
 }
 
-class CommandLine
-{
+class CommandLine {
 public:
   CommandLine() = default;
 
-  CommandLine(int argc, char** argv)
-  {
-    for (int i = 0; i < argc; ++i)
-    {
+  CommandLine(int argc, char** argv) {
+    for(int i = 0; i < argc; ++i) {
       std::string arg = argv[i];
       auto pos = arg.find("=");
-      if(pos != std::string::npos)
-      {
-        auto argName = arg.substr(0,pos);
-        auto argVal = arg.substr(pos+1);
-
-        if(args.find(argName) != args.end())
-        {
-          throw std::invalid_argument{
-              "Encountered command line argument several times: " + argName};
+      if(pos != std::string::npos) {
+        auto argName = arg.substr(0, pos);
+        auto argVal = arg.substr(pos + 1);
+
+        if(args.find(argName) != args.end()) {
+          throw std::invalid_argument{"Encountered command line argument several times: " + argName};
         }
 
         args[argName] = argVal;
-      }
-      else
-      {
+      } else {
         flags.insert(arg);
       }
     }
   }
 
-  bool isArgSet(const std::string& arg) const
-  {
-    return args.find(arg) != args.end();
-  }
+  bool isArgSet(const std::string& arg) const { return args.find(arg) != args.end(); }
 
-  template<class T>
-  T getOrDefault(const std::string& arg, const T& defaultVal) const
-  {
+  template <class T>
+  T getOrDefault(const std::string& arg, const T& defaultVal) const {
     if(isArgSet(arg))
       return cast<T>(args.at(arg));
     return defaultVal;
   }
 
-  template<class T>
-  T get(const std::string& arg) const
-  {
-    try
-    {
+  template <class T>
+  T get(const std::string& arg) const {
+    try {
       return cast<T>(args.at(arg));
-    }
-    catch(std::out_of_range& e)
-    {
-      throw std::invalid_argument{"Command line argument was requested but missing: "+arg};
+    } catch(std::out_of_range& e) {
+      throw std::invalid_argument{"Command line argument was requested but missing: " + arg};
     }
   }
 
-  bool isFlagSet(const std::string& flag) const
-  {
-    return flags.find(flag) != flags.end();
-  }
+  bool isFlagSet(const std::string& flag) const { return flags.find(flag) != flags.end(); }
 
-  
 
 private:
-  
-
   CommandLineArguments args;
   FlagList flags;
 };
 
 
-struct VerificationSetting
-{
+struct VerificationSetting {
   bool enabled;
-  cl::sycl::id<3> begin = {0, 0, 0};
-  cl::sycl::range<3> range = {1, 1, 1};
+  sycl::id<3> begin = {0, 0, 0};
+  sycl::range<3> range = {1, 1, 1};
 };
 
-struct BenchmarkArgs
-{
+struct BenchmarkArgs {
   size_t problem_size;
   size_t local_size;
   size_t num_runs;
-  cl::sycl::queue device_queue;
+  sycl::queue device_queue;
+  sycl::queue device_queue_in_order;
   VerificationSetting verification;
   // can be used to query additional benchmark specific information from the command line
   CommandLine cli;
@@ -169,68 +139,38 @@ struct BenchmarkArgs
   bool warmup_run;
 };
 
-class CUDASelector : public cl::sycl::device_selector {
-public:
-  int operator()(const cl::sycl::device& device) const override {
-    using namespace cl::sycl::info;
-    const std::string driverVersion = device.get_info<device::driver_version>();
-    if(device.is_gpu() && (driverVersion.find("CUDA") != std::string::npos)) {
-      return 1;
-    };
-    return -1;
-  }
-};
 
-class BenchmarkCommandLine
-{
+class BenchmarkCommandLine {
 public:
-  BenchmarkCommandLine(int argc, char **argv) 
-  : cli_parser{argc, argv} {}
+  BenchmarkCommandLine(int argc, char** argv) : cli_parser{argc, argv} {}
 
-  BenchmarkArgs getBenchmarkArgs() const
-  {
+  BenchmarkArgs getBenchmarkArgs() const {
     std::size_t size = cli_parser.getOrDefault<std::size_t>("--size", 3072);
     std::size_t local_size = cli_parser.getOrDefault<std::size_t>("--local", 256);
     std::size_t num_runs = cli_parser.getOrDefault<std::size_t>("--num-runs", 5);
 
     std::string device_type = cli_parser.getOrDefault<std::string>("--device", "default");
-    bool warmup_run = cli_parser.isFlagSet("--warmup-run");
-    if (warmup_run) {
-      // Make drop of first run transparent to the user
-      ++num_runs;
-    }
-    cl::sycl::queue q = getQueue(device_type);
+    sycl::queue q = getQueue(device_type);
+    sycl::queue q_in_order = getQueue(device_type, sycl::property::queue::in_order{});
 
     bool verification_enabled = true;
     if(cli_parser.isFlagSet("--no-verification"))
       verification_enabled = false;
 
-    auto verification_begin = cli_parser.getOrDefault<cl::sycl::id<3>>(
-      "--verification-begin", cl::sycl::id<3>{0,0,0});
-    
-    auto verification_range = cli_parser.getOrDefault<cl::sycl::range<3>>(
-      "--verification-range", cl::sycl::range<3>{1,1,1});
-
-    auto result_consumer = getResultConsumer(
-      cli_parser.getOrDefault<std::string>("--output","stdio"));
-
-    return BenchmarkArgs{size,
-                         local_size,
-                         num_runs,
-                         q,
-                         VerificationSetting{verification_enabled,
-                                             verification_begin,
-                                             verification_range},
-                         cli_parser,
-                         result_consumer,
-                         warmup_run};
+    auto verification_begin = cli_parser.getOrDefault<sycl::id<3>>("--verification-begin", sycl::id<3>{0, 0, 0});
+
+    auto verification_range = cli_parser.getOrDefault<sycl::range<3>>("--verification-range", sycl::range<3>{1, 1, 1});
+
+    auto result_consumer = getResultConsumer(cli_parser.getOrDefault<std::string>("--output", "stdio"));
+
+    return BenchmarkArgs{size, local_size, num_runs, q, q_in_order,
+        VerificationSetting{verification_enabled, verification_begin, verification_range}, cli_parser, result_consumer};
   }
 
 private:
   std::shared_ptr<ResultConsumer>
 
-  getResultConsumer(const std::string& result_consumer_name) const
-  {
+  getResultConsumer(const std::string& result_consumer_name) const {
     if(result_consumer_name == "stdio")
       return std::shared_ptr<ResultConsumer>{new OstreamResultConsumer{std::cout}};
     else
@@ -239,27 +179,23 @@ class BenchmarkCommandLine
       return std::shared_ptr<ResultConsumer>{new AppendingCsvResultConsumer{result_consumer_name}};
   }
 
-  cl::sycl::queue getQueue(const std::string& device_type) const {
-    const auto getQueueProperties = [&]() -> cl::sycl::property_list {
+  template <typename... Props>
+  sycl::queue getQueue(const std::string& device_type, Props&&... props) const {
+    const auto getQueueProperties = [&]() -> sycl::property_list {
+
 #if defined(SYCL_BENCH_ENABLE_QUEUE_PROFILING)
-      return cl::sycl::property::queue::enable_profiling{};
+      return {sycl::property::queue::enable_profiling{}, props...};
 #endif
-      return {};
+      return {props...};
     };
 
-#if defined(__LLVM_SYCL_CUDA__)
-    if(device_type != "gpu") {
-      throw std::invalid_argument{"Only the 'gpu' device is supported on LLVM CUDA"};
-    }
-    return cl::sycl::queue{CUDASelector{}, getQueueProperties()};
-#endif
 
     if(device_type == "cpu") {
-      return cl::sycl::queue{cl::sycl::cpu_selector{}, getQueueProperties()};
+      return sycl::queue{sycl::cpu_selector_v, getQueueProperties()};
     } else if(device_type == "gpu") {
-      return cl::sycl::queue{cl::sycl::gpu_selector{}, getQueueProperties()};
+      return sycl::queue{sycl::gpu_selector_v, getQueueProperties()};
     } else if(device_type == "default") {
-      return cl::sycl::queue{getQueueProperties()};
+      return sycl::queue{getQueueProperties()};
     } else {
       throw std::invalid_argument{"unknown device type: " + device_type};
     }
@@ -269,4 +205,3 @@ class BenchmarkCommandLine
 };
 
 #endif
-
diff --git a/include/common.h b/include/common.h
index e5e5e6d4..e3c65380 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,57 +1,47 @@
-#pragma once 
-#include <CL/sycl.hpp>
+#pragma once
+#include <sycl/sycl.hpp>
 
-#include <string>
-#include <iostream>
+#include <algorithm> // for std::min
 #include <cassert>
-#include <sstream>
+#include <iostream>
 #include <memory>
-#include <algorithm> // for std::min
+#include <optional>
+#include <sstream>
+#include <string>
 #include <type_traits>
 #include <unordered_set>
-#include <optional>
 
 #include "command_line.h"
 #include "result_consumer.h"
 #include "type_traits.h"
 
-  
+
 #include "benchmark_hook.h"
 #include "benchmark_traits.h"
-#include "prefetched_buffer.h"
+#include "memory_wrappers.h"
 #include "time_metrics.h"
 
-#ifdef NV_ENERGY_MEAS    
-  #include "nv_energy_meas.h"
+#ifdef NV_ENERGY_MEAS
+#include "nv_energy_meas.h"
 #endif
 
 
-
-template<class Benchmark>
-class BenchmarkManager
-{
+template <class Benchmark>
+class BenchmarkManager {
 public:
-  BenchmarkManager(const BenchmarkArgs &_args) : args(_args)  {}
+  BenchmarkManager(const BenchmarkArgs& _args) : args(_args) {}
 
-  void addHook(BenchmarkHook &h)
-  {
-    hooks.push_back(&h);
-  }
+  void addHook(BenchmarkHook& h) { hooks.push_back(&h); }
 
-  template<typename... Args>
-  void run(Args&&... additionalArgs)
-  {
+  template <typename... Args>
+  void run(Args&&... additionalArgs) {
     args.result_consumer->proceedToBenchmark(Benchmark{args, additionalArgs...}.getBenchmarkName(args));
 
+    args.result_consumer->consumeResult("problem-size", std::to_string(args.problem_size));
+    args.result_consumer->consumeResult("local-size", std::to_string(args.local_size));
     args.result_consumer->consumeResult(
-      "problem-size", std::to_string(args.problem_size));
-    args.result_consumer->consumeResult(
-      "local-size", std::to_string(args.local_size));
-    args.result_consumer->consumeResult(
-      "device-name", args.device_queue.get_device()
-                           .template get_info<cl::sycl::info::device::name>());
-    args.result_consumer->consumeResult(
-      "sycl-implementation", this->getSyclImplementation());
+        "device-name", args.device_queue.get_device().get_info<sycl::info::device::name>());
+    args.result_consumer->consumeResult("sycl-implementation", this->getSyclImplementation());
 
     TimeMetricsProcessor<Benchmark> time_metrics(args);
 
@@ -71,7 +61,7 @@ class BenchmarkManager
         args.device_queue.wait_and_throw();
         for(auto h : hooks) h->postSetup();
 
-        std::vector<cl::sycl::event> run_events;
+        std::vector<sycl::event> run_events;
         run_events.reserve(1024); // Make sure we don't need to resize during benchmarking.
 
         // Performance critical measurement section starts here
@@ -87,23 +77,37 @@ class BenchmarkManager
         for(auto h : hooks) h->postKernel();
         // Performance critical measurement section ends here
 
-        time_metrics.addTimingResult("run-time", std::chrono::duration_cast<std::chrono::nanoseconds>(after - before));
+        auto run_time = std::chrono::duration_cast<std::chrono::nanoseconds>(after - before);
+        time_metrics.addTimingResult("run-time", run_time);
 
         if(detail::BenchmarkTraits<Benchmark>::supportsQueueProfiling) {
 #if defined(SYCL_BENCH_ENABLE_QUEUE_PROFILING)
           // TODO: We might also want to consider the "command_submit" time.
           std::chrono::nanoseconds total_time{0};
+          std::chrono::nanoseconds submit_time{0};
+          // Runtime without kernel time
+          std::chrono::nanoseconds system_time{0};
           for(auto& e : run_events) {
-            const auto start = e.get_profiling_info<cl::sycl::info::event_profiling::command_start>();
-            const auto end = e.get_profiling_info<cl::sycl::info::event_profiling::command_end>();
+            const auto start = e.get_profiling_info<sycl::info::event_profiling::command_start>();
+            const auto end = e.get_profiling_info<sycl::info::event_profiling::command_end>();
+            const auto submit = e.get_profiling_info<sycl::info::event_profiling::command_submit>();
             total_time += std::chrono::nanoseconds(end - start);
+            submit_time += std::chrono::nanoseconds(start - submit);
           }
+          system_time += std::chrono::nanoseconds(run_time - total_time);
+
           time_metrics.addTimingResult("kernel-time", total_time);
+          time_metrics.addTimingResult("submit-time", submit_time);
+          time_metrics.addTimingResult("system-time", system_time);
 #else
           time_metrics.markAsUnavailable("kernel-time");
+          time_metrics.markAsUnavailable("submit-time");
+          time_metrics.markAsUnavailable("system-time");
 #endif
         } else {
           time_metrics.markAsUnavailable("kernel-time");
+          time_metrics.markAsUnavailable("submit-time");
+          time_metrics.markAsUnavailable("system-time");
         }
 
         if constexpr(detail::BenchmarkTraits<Benchmark>::hasVerify) {
@@ -123,7 +127,7 @@ class BenchmarkManager
 
     time_metrics.emitResults(*args.result_consumer);
 
-    for (auto h : hooks) {
+    for(auto h : hooks) {
       // Extract results from the hooks
       h->emitResults(*args.result_consumer);
     }
@@ -131,33 +135,26 @@ class BenchmarkManager
     if(args.verification.range.size() == 0 || !args.verification.enabled ||
         !detail::BenchmarkTraits<Benchmark>::hasVerify) {
       args.result_consumer->consumeResult("Verification", "N/A");
-    }
-    else if(!all_runs_pass){
+    } else if(!all_runs_pass) {
       // error
       args.result_consumer->consumeResult("Verification", "FAIL");
-    }
-    else {
+    } else {
       // pass
       args.result_consumer->consumeResult("Verification", "PASS");
-    }        
-    
+    }
+
     args.result_consumer->flush();
-    
   }
 
 private:
-  BenchmarkArgs args;  
+  BenchmarkArgs args;
   std::vector<BenchmarkHook*> hooks;
 
   std::string getSyclImplementation() const {
-#if defined(__HIPSYCL__)
-    return "hipSYCL";
-#elif defined(__COMPUTECPP__)
-    return "ComputeCpp";
-#elif defined(__LLVM_SYCL__)
+#if defined(__ACPP__)
+    return "AdaptiveCpp";
+#elif defined(__DPCPP__)
     return "LLVM (Intel DPC++)";
-#elif defined(__LLVM_SYCL_CUDA__)
-    return "LLVM CUDA (Codeplay)";
 #elif defined(__TRISYCL__)
     return "triSYCL";
 #else
@@ -167,38 +164,30 @@ class BenchmarkManager
 };
 
 
-class BenchmarkApp
-{
-  BenchmarkArgs args;  
-  cl::sycl::queue device_queue;
+class BenchmarkApp {
+  BenchmarkArgs args;
+  sycl::queue device_queue;
   std::unordered_set<std::string> benchmark_names;
-  
-public:  
-  BenchmarkApp(int argc, char** argv)
-  {
-    try{
+
+public:
+  BenchmarkApp(int argc, char** argv) {
+    try {
       args = BenchmarkCommandLine{argc, argv}.getBenchmarkArgs();
-    }
-    catch(std::exception& e){
+    } catch(std::exception& e) {
       std::cerr << "Error while parsing command lines: " << e.what() << std::endl;
     }
   }
 
-  const BenchmarkArgs& getArgs() const
-  { return args; }
+  const BenchmarkArgs& getArgs() const { return args; }
 
-  bool shouldRunNDRangeKernels() const
-  {
-    return !args.cli.isFlagSet("--no-ndrange-kernels");
-  }
+  bool shouldRunNDRangeKernels() const { return !args.cli.isFlagSet("--no-ndrange-kernels"); }
 
-  bool deviceHasAspect(cl::sycl::aspect asp) const { return device_queue.get_device().has(asp); }
+  bool deviceHasAspect(sycl::aspect asp) const { return device_queue.get_device().has(asp); }
 
-  bool deviceSupportsFP64() const { return deviceHasAspect(cl::sycl::aspect::fp64); }
+  bool deviceSupportsFP64() const { return deviceHasAspect(sycl::aspect::fp64); }
 
-  template<class Benchmark, typename... AdditionalArgs>
-  void run(AdditionalArgs&&... additional_args)
-  {
+  template <class Benchmark, typename... AdditionalArgs>
+  void run(AdditionalArgs&&... additional_args) {
     try {
       const auto name = Benchmark{args, additional_args...}.getBenchmarkName(args);
       if(benchmark_names.count(name) == 0) {
@@ -216,11 +205,9 @@ class BenchmarkApp
 #endif
 
       mgr.run(additional_args...);
-    }
-    catch(cl::sycl::exception& e){
+    } catch(sycl::exception& e) {
       std::cerr << "SYCL error: " << e.what() << std::endl;
-    }
-    catch(std::exception& e){
+    } catch(std::exception& e) {
       std::cerr << "Error: " << e.what() << std::endl;
     }
   }
diff --git a/include/memory_wrappers.h b/include/memory_wrappers.h
new file mode 100644
index 00000000..f895c282
--- /dev/null
+++ b/include/memory_wrappers.h
@@ -0,0 +1,296 @@
+#pragma once
+#include "common.h"
+#include <memory>
+
+#include "utils.h"
+
+
+template <class AccType>
+class InitializationDummyKernel { // No-op kernel functor whose only state is an accessor of type AccType.
+public:
+  InitializationDummyKernel(AccType acc) : acc{acc} {}
+  // Intentionally empty: the kernel does no work on the accessor it carries.
+  void operator()() const {}
+
+private:
+  AccType acc; // stored, never read by operator()
+};
+
+class InitializationDummyKernel2;
+
+template <class BufferType> // Submits a no-op task that reads `b` and waits for it to finish.
+inline void forceDataTransfer(sycl::queue& q, BufferType b) { // presumably triggers the host-to-device copy; confirm
+  q.submit([&](sycl::handler& cgh) {
+    auto acc = b.template get_access<sycl::access::mode::read>(cgh); // read-mode accessor on `b`
+    cgh.single_task(InitializationDummyKernel{acc});                 // the kernel body is empty
+  });
+  q.wait_and_throw(); // block until the task has completed
+}
+
+template <class BufferType> // Submits a no-op task with discard_write access to `b` and waits for it to finish.
+inline void forceDataAllocation(sycl::queue& q, BufferType b) { // presumably forces the device allocation; confirm
+  q.submit([&](sycl::handler& cgh) {
+    auto acc = b.template get_access<sycl::access::mode::discard_write>(cgh); // discard_write accessor on `b`
+    cgh.single_task(InitializationDummyKernel{acc});                          // the kernel body is empty
+  });
+  q.wait_and_throw(); // block until the task has completed
+}
+
+template <class T, int Dimensions = 1>
+class PrefetchedBuffer { // Default-constructible holder of a sycl::buffer<T, Dimensions>; empty until initialize().
+public:
+  void initialize(sycl::queue& q, sycl::range<Dimensions> r) { // buffer of range r without host data
+    buff = std::make_shared<sycl::buffer<T, Dimensions>>(r);
+    forceDataAllocation(q, *buff);
+  }
+  // Buffer constructed from the host pointer `data`, with write-back disabled.
+  void initialize(sycl::queue& q, T* data, sycl::range<Dimensions> r) {
+    buff = std::make_shared<sycl::buffer<T, Dimensions>>(data, r);
+    buff->set_write_back(false);
+    forceDataTransfer(q, *buff);
+  }
+  // Same as above for a pointer to const host data.
+  void initialize(sycl::queue& q, const T* data, sycl::range<Dimensions> r) {
+    buff = std::make_shared<sycl::buffer<T, Dimensions>>(data, r);
+    buff->set_write_back(false);
+    forceDataTransfer(q, *buff);
+  }
+
+  // The accessors below forward to the wrapped buffer, so initialize() must have been called first.
+  template <sycl::access::mode mode, sycl::target target = sycl::target::device>
+  auto get_access(sycl::handler& commandGroupHandler) {
+    return buff->template get_access<mode, target>(commandGroupHandler);
+  }
+  // Overload that takes no command group handler.
+  template <sycl::access::mode mode>
+  auto get_access() {
+    return buff->template get_access<mode>();
+  }
+  // Ranged overload: forwards accessRange and accessOffset to the buffer.
+  template <sycl::access::mode mode, sycl::target target = sycl::target::device>
+  auto get_access(
+      sycl::handler& commandGroupHandler, sycl::range<Dimensions> accessRange, sycl::id<Dimensions> accessOffset = {}) {
+    return buff->template get_access<mode, target>(commandGroupHandler, accessRange, accessOffset);
+  }
+  // Ranged overload that takes no command group handler.
+  template <sycl::access::mode mode>
+  auto get_access(sycl::range<Dimensions> accessRange, sycl::id<Dimensions> accessOffset = {}) {
+    return buff->template get_access<mode>(accessRange, accessOffset);
+  }
+  // No `template` keyword here: the disambiguator is only valid in front of an explicit template-argument list.
+  auto get_host_access() { return buff->get_host_access(); }
+  // Forwards to the buffer's get_range().
+  sycl::range<Dimensions> get_range() const { return buff->get_range(); }
+  // Mutable reference to the wrapped buffer, available even through a const PrefetchedBuffer.
+  sycl::buffer<T, Dimensions>& get() const { return *buff; }
+  // Drops this wrapper's reference to the buffer.
+  void reset() { buff = nullptr; }
+
+private:
+  // Wrap in a shared_ptr to allow default constructing this class
+  std::shared_ptr<sycl::buffer<T, Dimensions>> buff;
+};
+
+
+namespace detail { // SFINAE helpers and per-allocation-kind traits used by USMBuffer.
+template <typename T, typename U, size_t val, size_t expected>
+struct has_dim_impl { // T and U are not used in the body; only val and expected matter.
+  static constexpr bool value = val == expected;
+};
+// Shorthand for has_dim_impl<T, T, val, expected>::value.
+template <typename T, size_t val, size_t expected>
+static constexpr bool has_dim_v = has_dim_impl<T, T, val, expected>::value;
+// Names void when val == expected; otherwise substitution fails, which disables the constrained template.
+template <typename T, size_t val, size_t expected>
+using has_dim_t = std::enable_if_t<has_dim_v<T, val, expected> == true, void>;
+// Accessibility flags per USM allocation kind; the primary template is only declared here.
+template <sycl::usm::alloc type>
+struct usm_properties;
+// NOTE(review): this using-directive sits in a header, so every includer sees sycl::usm names inside detail.
+using namespace sycl::usm;
+template <>
+struct usm_properties<alloc::device> {
+  static constexpr bool is_device_accessible = true;
+  static constexpr bool is_host_accessible = false;
+};
+template <>
+struct usm_properties<alloc::host> {
+  static constexpr bool is_device_accessible = true;
+  static constexpr bool is_host_accessible = true;
+};
+template <>
+struct usm_properties<alloc::shared> {
+  static constexpr bool is_device_accessible = true;
+  static constexpr bool is_host_accessible = true;
+};
+
+
+} // namespace detail
+
+
+template <typename T, std::size_t dim = 1, sycl::usm::alloc type = sycl::usm::alloc::device>
+class USMBuffer {
+  static_assert(dim >= 1 && dim <= 3, "Invalid dim provided");
+  // Owns a USM allocation of kind `type` and, when that kind is not host accessible, a host staging copy.
+protected:
+  T* _data;                // USM allocation of kind `type`
+  T* _host_ptr;            // host staging allocation, or an alias of _data for host-accessible kinds
+  sycl::range<dim> _count; // extent of each dimension
+  std::size_t total_size;  // number of T elements (not bytes)
+  sycl::queue* queue;      // non-owning; set by initialize(q, ...)
+
+public:
+  USMBuffer() : _data(nullptr), _host_ptr(nullptr), _count(getRange()), total_size(0), queue(nullptr) {}
+
+  // NOTE(review): the implicit copy operations duplicate the raw pointers (double free); consider deleting them.
+  ~USMBuffer() { release(); }
+
+  // Binds the buffer to `q` and allocates `count` elements (overload enabled for dim == 1 only).
+  template <typename U = T, typename = detail::has_dim_t<U, dim, 1>>
+  void initialize(sycl::queue& q, size_t count) {
+    release(); // must run before rebinding: an existing allocation is freed through the old queue
+    queue = &q;
+    allocate(count);
+  }
+
+  // Binds the buffer to `q` and allocates the product of the extents in `count`.
+  void initialize(sycl::queue& q, sycl::range<dim> count) {
+    release(); // must run before rebinding: an existing allocation is freed through the old queue
+    queue = &q;
+    allocate(count);
+  }
+
+  // Reallocates for `count` elements and copies them from `data`; needs a prior initialize(q, ...) for the queue.
+  void initialize(const T* data, size_t count) {
+    assert(queue != nullptr && "initialize(data, ...) requires a queue; call initialize(q, ...) first");
+    allocate(count);
+    copy(data, _data, count);
+  }
+
+  // Range flavour of the overload above.
+  void initialize(const T* data, sycl::range<dim> count) {
+    assert(queue != nullptr && "initialize(data, ...) requires a queue; call initialize(q, ...) first");
+    allocate(count);
+    copy(data, _data, count);
+  }
+
+  // Blocking device-to-host refresh of the staging copy; does nothing for host-accessible kinds.
+  void update_host() {
+    if constexpr(!detail::usm_properties<type>::is_host_accessible) {
+      queue->copy(_data, _host_ptr, total_size);
+      queue->wait_and_throw();
+    }
+  }
+
+  // Non-blocking variant ordered after `event`; returns the copy's event, or `event` itself if nothing is copied.
+  sycl::event update_host(sycl::event event) {
+    if constexpr(!detail::usm_properties<type>::is_host_accessible) {
+      return queue->copy(_data, _host_ptr, total_size, event);
+    } else
+      return event;
+  }
+
+  // Starts a host-to-device copy of the staging data without waiting; returns a default event if nothing is copied.
+  sycl::event update_device() {
+    if constexpr(detail::usm_properties<type>::is_device_accessible &&
+                 !detail::usm_properties<type>::is_host_accessible) {
+      assert(_host_ptr != nullptr && "calling update_device when no modification has been made on the host");
+      return queue->copy(_host_ptr, _data, total_size);
+    } else
+      return sycl::event{};
+  }
+
+  // Same as update_device(), ordered after `event`.
+  sycl::event update_device(sycl::event event) {
+    if constexpr(detail::usm_properties<type>::is_device_accessible &&
+                 !detail::usm_properties<type>::is_host_accessible) {
+      assert(_host_ptr != nullptr && "calling update_device when no modification has been made on the host");
+      return queue->copy(_host_ptr, _data, total_size, event);
+    } else
+      return event;
+  }
+
+  T* get() const { return _data; }
+
+  T* get_host_ptr() const {
+    assert(_host_ptr != nullptr && "_host_ptr not initialized. You should first call update_host()");
+    return _host_ptr;
+  }
+
+  T* update_and_get_host_ptr() {
+    update_host();
+    return _host_ptr;
+  }
+
+  std::tuple<T*, sycl::event> update_and_get_host_ptr(sycl::event event) {
+    auto new_event = update_host(event);
+    return {_host_ptr, new_event};
+  }
+
+  auto size() const { return total_size; } // element count, not bytes
+
+private:
+  template <sycl::usm::alloc alloc_type>
+  T* malloc(size_t count) {
+    return static_cast<T*>(sycl::malloc(count * sizeof(T), *queue, alloc_type));
+  }
+
+  // Frees whatever is currently allocated and returns to the empty state; safe to call on an empty buffer.
+  void release() {
+    if constexpr(!detail::usm_properties<type>::is_host_accessible) {
+      if(_host_ptr != nullptr) sycl::free(_host_ptr, *queue);
+    }
+    if(_data != nullptr) sycl::free(_data, *queue);
+    _data = _host_ptr = nullptr;
+    _count = getRange();
+    total_size = 0;
+  }
+
+  static constexpr auto getRange() {
+    if constexpr(dim == 1) {
+      return sycl::range<dim>(0);
+    } else if constexpr(dim == 2) {
+      return sycl::range<dim>(0, 0);
+    } else {
+      return sycl::range<dim>(0, 0, 0);
+    }
+  }
+
+  // Number of elements spanned by `count`: the product (not the sum) of its extents.
+  static std::size_t getSize(const sycl::range<dim>& count) {
+    std::size_t num_elements = 1;
+    loop<dim>([&](std::size_t idx) { num_elements *= count[idx]; });
+    return num_elements;
+  }
+
+  template <typename U = T, typename = detail::has_dim_t<U, dim, 1>>
+  void allocate(size_t count) {
+    release(); // never leak a previous allocation on re-initialization
+    _data = malloc<type>(count);
+    if constexpr(!detail::usm_properties<type>::is_host_accessible) {
+      _host_ptr = static_cast<T*>(sycl::malloc_host(count * sizeof(T), *queue));
+    } else {
+      _host_ptr = _data;
+    }
+    this->_count = sycl::range<dim>{count};
+    total_size = count;
+  }
+
+  void allocate(const sycl::range<dim>& count) {
+    release(); // never leak a previous allocation on re-initialization
+    const std::size_t num_elements = getSize(count);
+    _data = malloc<type>(num_elements);
+    if constexpr(!detail::usm_properties<type>::is_host_accessible) {
+      _host_ptr = static_cast<T*>(sycl::malloc_host(num_elements * sizeof(T), *queue));
+    } else {
+      _host_ptr = _data;
+    }
+    this->_count = count;
+    this->total_size = num_elements;
+  }
+
+  // Blocking copies of T elements through the bound queue; the range overload copies getSize(count) elements.
+  void copy(const T* src, T* dst, std::size_t count) const { queue->copy(src, dst, count).wait_and_throw(); }
+
+  void copy(const T* src, T* dst, sycl::range<dim> count) const { copy(src, dst, getSize(count)); }
+};
\ No newline at end of file
diff --git a/include/nv_energy_meas.h b/include/nv_energy_meas.h
index cda24f81..71f55038 100644
--- a/include/nv_energy_meas.h
+++ b/include/nv_energy_meas.h
@@ -1,4 +1,4 @@
-#pragma once 
+#pragma once
 
 // NVML energy measurmeent
 // TODO
\ No newline at end of file
diff --git a/include/prefetched_buffer.h b/include/prefetched_buffer.h
deleted file mode 100644
index 251ac592..00000000
--- a/include/prefetched_buffer.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#pragma once
-#include <CL/sycl.hpp>
-#include <memory>
-
-template<class AccType>
-class InitializationDummyKernel
-{
-public:
-  InitializationDummyKernel(AccType acc)
-  : acc{acc} {}
-
-  void operator()() const {}
-private:
-  AccType acc;
-};
-
-class InitializationDummyKernel2;
-
-template <class BufferType>
-inline void forceDataTransfer(cl::sycl::queue& q, BufferType b) {
-  q.submit([&](cl::sycl::handler& cgh) {
-    auto acc = b.template get_access<cl::sycl::access::mode::read>(cgh);
-    cgh.single_task(InitializationDummyKernel{acc});
-  });
-  q.wait_and_throw();
-}
-
-template <class BufferType>
-inline void forceDataAllocation(cl::sycl::queue& q, BufferType b) {
-  q.submit([&](cl::sycl::handler& cgh) {
-    auto acc = b.template get_access<cl::sycl::access::mode::discard_write>(cgh);
-    cgh.single_task(InitializationDummyKernel{acc});
-  });
-  q.wait_and_throw();
-}
-
-template <class T, int Dimensions=1>
-class PrefetchedBuffer {
-public:
-  void initialize(cl::sycl::queue& q, cl::sycl::range<Dimensions> r) {
-    buff = std::make_shared<cl::sycl::buffer<T, Dimensions>>(r);
-    forceDataAllocation(q, *buff);
-  }
-
-  void initialize(cl::sycl::queue& q, T* data, cl::sycl::range<Dimensions> r) {
-    buff = std::make_shared<cl::sycl::buffer<T, Dimensions>>(data, r);
-    forceDataTransfer(q, *buff);
-  }
-
-  void initialize(cl::sycl::queue& q, const T* data, cl::sycl::range<Dimensions> r) {
-    buff = std::make_shared<cl::sycl::buffer<T, Dimensions>>(data, r);
-    forceDataTransfer(q, *buff);
-  }
-
-
-  template <cl::sycl::access::mode mode, cl::sycl::access::target target = cl::sycl::access::target::global_buffer>
-  auto get_access(cl::sycl::handler& commandGroupHandler) {
-    return buff->template get_access<mode, target>(commandGroupHandler);
-  }
-
-  template <cl::sycl::access::mode mode>
-  auto get_access() {
-    return buff->template get_access<mode>();
-  }
-
-  template <cl::sycl::access::mode mode, cl::sycl::access::target target = cl::sycl::access::target::global_buffer>
-  auto get_access(cl::sycl::handler& commandGroupHandler, cl::sycl::range<Dimensions> accessRange,
-      cl::sycl::id<Dimensions> accessOffset = {}) {
-    return buff->template get_access<mode, target>(commandGroupHandler, accessRange, accessOffset);
-  }
-
-  template <cl::sycl::access::mode mode>
-  auto get_access(cl::sycl::range<Dimensions> accessRange, cl::sycl::id<Dimensions> accessOffset = {}) {
-    return buff->template get_access<mode>(accessRange, accessOffset);
-  }
-
-  cl::sycl::range<Dimensions> get_range() const
-  {
-    return buff->get_range();
-  }
-
-  cl::sycl::buffer<T, Dimensions>& get() const { return *buff; }
-
-  void reset() { buff = nullptr; }
-
-private:
-  // Wrap in a shared_ptr to allow default constructing this class
-  std::shared_ptr<cl::sycl::buffer<T, Dimensions>> buff;
-};
diff --git a/include/result_consumer.h b/include/result_consumer.h
index 24715f4e..77b2e8a2 100644
--- a/include/result_consumer.h
+++ b/include/result_consumer.h
@@ -1,22 +1,20 @@
 #ifndef RESULT_CONSUMER_HPP
 #define RESULT_CONSUMER_HPP
 
+#include <algorithm>
 #include <cassert>
 #include <fstream>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <algorithm>
 
-class ResultConsumer
-{
+class ResultConsumer {
 public:
   virtual void proceedToBenchmark(const std::string& name) = 0;
   // Register a result in the result consumer
-  virtual void consumeResult(const std::string& result_name,
-                            const std::string& result,
-                            const std::string& unit = "") = 0;
+  virtual void consumeResult(
+      const std::string& result_name, const std::string& result, const std::string& unit = "") = 0;
 
   // Guarantees that the results have been emitted to the output
   // as specified by the ResultConsumer implementation
@@ -25,32 +23,23 @@ class ResultConsumer
   // Discards the current benchmark's results, useful e.g. in case of errors.
   virtual void discard() {}
 
-  virtual ~ResultConsumer(){}
-  
+  virtual ~ResultConsumer() {}
 };
 
-class OstreamResultConsumer : public ResultConsumer
-{
+class OstreamResultConsumer : public ResultConsumer {
   std::ostream& output;
   std::string name;
 
 public:
-  OstreamResultConsumer(std::ostream& ostr)
-  : output{ostr}
-  {}
+  OstreamResultConsumer(std::ostream& ostr) : output{ostr} {}
 
-  virtual void proceedToBenchmark(const std::string& benchmark_name) override
-  {
+  virtual void proceedToBenchmark(const std::string& benchmark_name) override {
     name = benchmark_name;
-    output << "********** Results for " << name 
-           << "**********" << std::endl;
-    
+    output << "********** Results for " << name << "**********" << std::endl;
   }
 
-  virtual void consumeResult(const std::string& result_name,
-                            const std::string& result,
-                            const std::string& unit = "") override
-  {
+  virtual void consumeResult(
+      const std::string& result_name, const std::string& result, const std::string& unit = "") override {
     output << result_name << ": " << result;
     if(!unit.empty()) {
       output << " [" << unit << "]";
@@ -58,63 +47,48 @@ class OstreamResultConsumer : public ResultConsumer
     output << std::endl;
   }
 
-  virtual void flush() override
-  {
-  }
+  virtual void flush() override {}
 };
 
 // TODO ResultConsumer that appends to a csv
-class AppendingCsvResultConsumer : public ResultConsumer
-{
+class AppendingCsvResultConsumer : public ResultConsumer {
 public:
   using benchmark_data = std::unordered_map<std::string, std::string>;
 
-  AppendingCsvResultConsumer(const std::string& filename)
-  : output{filename, std::ios::app}
-  {}
+  AppendingCsvResultConsumer(const std::string& filename) : output{filename, std::ios::app} {}
 
-  virtual void proceedToBenchmark(const std::string& benchmark_name) override
-  {
-    currentBenchmark = benchmark_name;
-  }
+  virtual void proceedToBenchmark(const std::string& benchmark_name) override { currentBenchmark = benchmark_name; }
 
-  virtual void consumeResult(const std::string& result_name,
-                            const std::string& result,
-                            const std::string& unit = "") override
-  {
+  virtual void consumeResult(
+      const std::string& result_name, const std::string& result, const std::string& unit = "") override {
     data[currentBenchmark][result_name] = result;
   }
 
-  virtual void flush() override
-  {
+  virtual void flush() override {
     std::unordered_set<std::string> columns;
 
-    for(const auto& benchmark: data) {
+    for(const auto& benchmark : data) {
       for(auto entry : benchmark.second) {
         columns.insert(entry.first);
       }
     }
 
     std::vector<std::string> sorted_columns;
-    for(auto c : columns)
-      sorted_columns.push_back(c);
+    for(auto c : columns) sorted_columns.push_back(c);
     // To make sure order of columns is deterministic
-    std::sort(sorted_columns.begin(),sorted_columns.end());
+    std::sort(sorted_columns.begin(), sorted_columns.end());
 
     output << "# Benchmark name";
-    for(auto c : sorted_columns)
-      output << "," << c;
+    for(auto c : sorted_columns) output << "," << c;
     output << std::endl;
 
     for(const auto& benchmark : data) {
       output << benchmark.first;
-      for(auto c : sorted_columns)
-        output << "," << benchmark.second.at(c);
+      for(auto c : sorted_columns) output << "," << benchmark.second.at(c);
       output << std::endl;
     }
 
     data.clear();
-    
   }
 
   void discard() override {
@@ -132,4 +106,3 @@ class AppendingCsvResultConsumer : public ResultConsumer
 };
 
 #endif
-
diff --git a/include/type_traits.h b/include/type_traits.h
index cdb4f1b9..f374707b 100644
--- a/include/type_traits.h
+++ b/include/type_traits.h
@@ -1,14 +1,15 @@
 #ifndef TYPE_TRAITS_H
 #define TYPE_TRAITS_H
 
-template<class T>
-struct ReadableTypename
-{};
+template <class T>
+struct ReadableTypename {};
 
-#define MAKE_READABLE_TYPENAME(T, str) \
-template<> \
-struct ReadableTypename<T> \
-{ static const char* name; }; const char* ReadableTypename<T>::name = str;
+#define MAKE_READABLE_TYPENAME(T, str)                                                                                 \
+  template <>                                                                                                          \
+  struct ReadableTypename<T> {                                                                                         \
+    static const char* name;                                                                                           \
+  };                                                                                                                   \
+  const char* ReadableTypename<T>::name = str;
 
 MAKE_READABLE_TYPENAME(char, "int8")
 MAKE_READABLE_TYPENAME(unsigned char, "uint8")
diff --git a/include/utils.h b/include/utils.h
new file mode 100644
index 00000000..6dd3586d
--- /dev/null
+++ b/include/utils.h
@@ -0,0 +1,12 @@
+#include <array>
+#include <type_traits.h>
+
+template <std::size_t... Idx, typename F> // Calls f once per index in Idx, passing a std::integral_constant.
+void loop_impl(std::integer_sequence<std::size_t, Idx...>, F&& f) {
+  (f(std::integral_constant<std::size_t, Idx>{}), ...); // comma fold: the calls run left to right
+}
+
+template <std::size_t count, typename F> // Calls f for each index 0 .. count-1 by delegating to loop_impl.
+void loop(F&& f) {
+  loop_impl(std::make_index_sequence<count>{}, std::forward<F>(f)); // index pack 0, 1, ..., count-1
+}
\ No newline at end of file
diff --git a/micro/DRAM.cpp b/micro/DRAM.cpp
index 59c5b7d5..26c5252b 100644
--- a/micro/DRAM.cpp
+++ b/micro/DRAM.cpp
@@ -1,6 +1,6 @@
 #include "common.h"
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 template <typename DataT, int Dims>
 class MicroBenchDRAMKernel;
@@ -52,7 +52,7 @@ class MicroBenchDRAM {
   }
 
   void run(std::vector<s::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.template get_access<s::access::mode::read>(cgh);
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
       // We spawn one work item for each buffer element to be copied.
@@ -62,7 +62,7 @@ class MicroBenchDRAM {
   }
 
   bool verify(VerificationSetting& ver) {
-    auto result = output_buf.template get_access<s::access::mode::read>();
+    auto result = output_buf.get_host_access();
     for(size_t i = 0; i < buffer_size[0]; ++i) {
       for(size_t j = 0; j < (Dims < 2 ? 1 : buffer_size[1]); ++j) {
         for(size_t k = 0; k < (Dims < 3 ? 1 : buffer_size[2]); ++k) {
@@ -102,11 +102,10 @@ int main(int argc, char** argv) {
   app.run<MicroBenchDRAM<float, 1>>();
   app.run<MicroBenchDRAM<float, 2>>();
   app.run<MicroBenchDRAM<float, 3>>();
-  if(app.deviceSupportsFP64()) {
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<MicroBenchDRAM<double, 1>>();
     app.run<MicroBenchDRAM<double, 2>>();
     app.run<MicroBenchDRAM<double, 3>>();
   }
-
   return 0;
 }
diff --git a/micro/arith.cpp b/micro/arith.cpp
index b6962447..1029c74c 100644
--- a/micro/arith.cpp
+++ b/micro/arith.cpp
@@ -1,6 +1,6 @@
 #include "common.h"
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 template <typename DataT, int Iterations>
 class MicroBenchArithmeticKernel;
@@ -46,13 +46,13 @@ class MicroBenchArithmetic {
     return {};
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.template get_access<s::access::mode::read>(cgh);
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
 
       cgh.parallel_for<MicroBenchArithmeticKernel<DataT, Iterations>>(
-          s::range<1>{args.problem_size}, [=](cl::sycl::id<1> gid) {
+          s::range<1>{args.problem_size}, [=](sycl::id<1> gid) {
             DataT a1 = in[gid];
             const DataT a2 = a1;
 
@@ -68,7 +68,7 @@ class MicroBenchArithmetic {
   }
 
   bool verify(VerificationSetting& ver) {
-    auto result = output_buf.template get_access<s::access::mode::read>();
+    auto result = output_buf.get_host_access();
     for(size_t i = 0; i < args.problem_size; ++i) {
       if(result[i] != DataT{1}) {
         return false;
@@ -91,8 +91,8 @@ int main(int argc, char** argv) {
 
   app.run<MicroBenchArithmetic<int>>();
   app.run<MicroBenchArithmetic<float>>();
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<MicroBenchArithmetic<double>>();
-
+  }
   return 0;
 }
diff --git a/micro/host_device_bandwidth.cpp b/micro/host_device_bandwidth.cpp
index 24b03db3..bc0c32c8 100644
--- a/micro/host_device_bandwidth.cpp
+++ b/micro/host_device_bandwidth.cpp
@@ -1,6 +1,6 @@
 #include "common.h"
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 // The data type to be copied. This was originally a single byte (char), however
 // this causes device-side initialization kernels to quickly reach the
@@ -117,8 +117,10 @@ class MicroBenchHostDeviceBandwidth {
         // Initialize buffer on device
         args.device_queue.submit([&](s::handler& cgh) {
           auto acc = buffer->template get_access<s::access::mode::discard_write>(cgh);
-          cgh.parallel_for<D2HInitKernel<Dims, Strided>>(
-              copy_size, getStridedCopyOffset<Dims, true>(), [=](s::id<Dims> gid) { acc[gid] = TEST_VALUE; });
+          cgh.parallel_for<D2HInitKernel<Dims, Strided>>(copy_size, [=](s::id<Dims> gid) {
+            auto offset = getStridedCopyOffset<Dims, true>();
+            acc[gid + offset] = TEST_VALUE;
+          });
         });
       }
     }
@@ -142,7 +144,7 @@ class MicroBenchHostDeviceBandwidth {
 
       if constexpr(Direction == CopyDirection::DEVICE_TO_HOST) {
         // Request host accessor for data that has been written on device
-        buffer->template get_access<s::access::mode::read>();
+        buffer->get_host_access();
       }
     }
 
@@ -200,13 +202,13 @@ class MicroBenchHostDeviceBandwidth {
         cgh.single_task<H2DVerificationKernel<Dims, Strided>>([=]() { /* NOP */ });
       });
 
-      auto acc = buffer->template get_access<s::access::mode::read>();
+      auto acc = buffer->get_host_access();
       return verifyAccessor(acc);
     }
 
     if constexpr(Direction == CopyDirection::DEVICE_TO_HOST) {
       if constexpr(!Strided) {
-        auto acc = buffer->template get_access<s::access::mode::read>();
+        auto acc = buffer->get_host_access();
         return verifyAccessor(acc);
       }
 
@@ -254,4 +256,4 @@ int main(int argc, char** argv) {
   app.run<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>>();
 
   return 0;
-}
+}
\ No newline at end of file
diff --git a/micro/local_mem.cpp b/micro/local_mem.cpp
index 72c6fdbe..8eea7dac 100644
--- a/micro/local_mem.cpp
+++ b/micro/local_mem.cpp
@@ -2,7 +2,7 @@
 
 #include <iostream>
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 template <typename DATA_TYPE, int COMP_ITERS>
 class MicroBenchLocalMemoryKernel;
@@ -30,12 +30,12 @@ class MicroBenchLocalMemory {
     output_buf.initialize(args.device_queue, s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.template get_access<s::access::mode::read>(cgh);
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
       // local memory definition
-      s::accessor<DATA_TYPE, 1, s::access::mode::read_write, s::access::target::local> local_mem(args.local_size, cgh);
+      s::local_accessor<DATA_TYPE, 1> local_mem(args.local_size, cgh);
 
       s::nd_range<1> ndrange{{args.problem_size}, {args.local_size}};
 
@@ -43,20 +43,21 @@ class MicroBenchLocalMemory {
         DATA_TYPE r0;
         int gid = item.get_global_id(0);
         int lid = item.get_local_id(0);
-        int lid2 = (item.get_local_id(0)+1) % item.get_local_range()[0];
+        int lid2 = (item.get_local_id(0) + 1) % item.get_local_range()[0];
 
         local_mem[lid] = in[gid];
 
-        item.barrier(s::access::fence_space::local_space);
+        s::group_barrier(item.get_group());
 
-        // Note: this is dangerous, as a compiler could in principle be smart enough to figure out that it can just drop this
-        //       so far, we haven't encountered such a compiler, and all options to make it "safer" 
+        // Note: this is dangerous, as a compiler could in principle be smart enough to figure out that it can just drop
+        // this
+        //       so far, we haven't encountered such a compiler, and all options to make it "safer"
         //       introduce overhead on at least some platform / data type combinations
         for(int i = 0; i < COMP_ITERS; i++) {
           local_mem[lid2] = local_mem[lid];
         }
 
-        item.barrier(s::access::fence_space::local_space);
+        s::group_barrier(item.get_group());
 
         out[gid] = local_mem[lid];
       });
@@ -72,7 +73,7 @@ class MicroBenchLocalMemory {
   }
 
   bool verify(VerificationSetting& ver) {
-    auto result = output_buf.template get_access<s::access::mode::read>();
+    auto result = output_buf.get_host_access();
     for(size_t i = 0; i < args.problem_size; ++i) {
       if(result[i] != 42) {
         return false;
@@ -99,8 +100,8 @@ int main(int argc, char** argv) {
   app.run<MicroBenchLocalMemory<float, compute_iters>>();
 
   // double precision
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<MicroBenchLocalMemory<double, compute_iters>>();
-
+  }
   return 0;
 }
diff --git a/micro/pattern_L2.cpp b/micro/pattern_L2.cpp
index edd478c6..b6d03229 100644
--- a/micro/pattern_L2.cpp
+++ b/micro/pattern_L2.cpp
@@ -2,46 +2,43 @@
 
 #include <iostream>
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
-template <typename DATA_TYPE, int COMP_ITERS> class MicroBenchL2Kernel;
+template <typename DATA_TYPE, int COMP_ITERS>
+class MicroBenchL2Kernel;
 
 /* Microbenchmark stressing the main arithmetic units. */
 template <typename DATA_TYPE, int COMP_ITERS>
-class MicroBenchL2
-{
+class MicroBenchL2 {
 protected:
-    std::vector<DATA_TYPE> input;
-    BenchmarkArgs args;
+  std::vector<DATA_TYPE> input;
+  BenchmarkArgs args;
+
+  PrefetchedBuffer<DATA_TYPE, 1> input_buf;
+  PrefetchedBuffer<DATA_TYPE, 1> output_buf;
 
-    PrefetchedBuffer<DATA_TYPE, 1> input_buf;
-    PrefetchedBuffer<DATA_TYPE, 1> output_buf;
 public:
-  MicroBenchL2(const BenchmarkArgs &_args) : args(_args) {}
+  MicroBenchL2(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
-    // buffers initialized to a default value 
-    input. resize(args.problem_size, 10);
+    // buffers initialized to a default value
+    input.resize(args.problem_size, 10);
 
     input_buf.initialize(args.device_queue, input.data(), s::range<1>(args.problem_size));
     output_buf.initialize(args.device_queue, s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events){
-
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
-      auto in  =  input_buf.template get_access<s::access::mode::read>(cgh);
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+      auto in = input_buf.template get_access<s::access::mode::read>(cgh);
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<1> ndrange {args.problem_size};
+      sycl::range<1> ndrange{args.problem_size};
 
-      cgh.parallel_for<MicroBenchL2Kernel<DATA_TYPE,COMP_ITERS>>(ndrange,
-        [=](cl::sycl::id<1> gid)
-      {
+      cgh.parallel_for<MicroBenchL2Kernel<DATA_TYPE, COMP_ITERS>>(ndrange, [=](sycl::id<1> gid) {
         DATA_TYPE r0;
-        for (int i=0;i<COMP_ITERS;i++) {
-            r0 = in[gid];
-            out[gid] = r0; 
+        for(int i = 0; i < COMP_ITERS; i++) {
+          r0 = in[gid];
+          out[gid] = r0;
         }
         out[gid] = r0;
       });
@@ -57,35 +54,31 @@ class MicroBenchL2
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   // int
-  app.run< MicroBenchL2<int,1> >();
-  app.run< MicroBenchL2<int,2> >();
-  app.run< MicroBenchL2<int,4> >();
-  app.run< MicroBenchL2<int,8> >();
-  app.run< MicroBenchL2<int,16> >();
-
-  // single precision  
-  app.run< MicroBenchL2<float,1> >();
-  app.run< MicroBenchL2<float,2> >();
-  app.run< MicroBenchL2<float,4> >();
-  app.run< MicroBenchL2<float,8> >();
-  app.run< MicroBenchL2<float,16> >();
+  app.run<MicroBenchL2<int, 1>>();
+  app.run<MicroBenchL2<int, 2>>();
+  app.run<MicroBenchL2<int, 4>>();
+  app.run<MicroBenchL2<int, 8>>();
+  app.run<MicroBenchL2<int, 16>>();
+
+  // single precision
+  app.run<MicroBenchL2<float, 1>>();
+  app.run<MicroBenchL2<float, 2>>();
+  app.run<MicroBenchL2<float, 4>>();
+  app.run<MicroBenchL2<float, 8>>();
+  app.run<MicroBenchL2<float, 16>>();
+
 
   // double precision
-  if(app.deviceSupportsFP64()) {
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<MicroBenchL2<double, 1>>();
     app.run<MicroBenchL2<double, 2>>();
     app.run<MicroBenchL2<double, 4>>();
     app.run<MicroBenchL2<double, 8>>();
     app.run<MicroBenchL2<double, 16>>();
   }
-
   return 0;
 }
-
-
-
diff --git a/micro/sf.cpp b/micro/sf.cpp
index 47b9202d..90c5e9a8 100644
--- a/micro/sf.cpp
+++ b/micro/sf.cpp
@@ -1,6 +1,6 @@
 #include "common.h"
 
-namespace s = cl::sycl;
+namespace s = sycl;
 
 template <typename DataT, int N>
 class MicroBenchSpecialFuncKernel;
@@ -32,7 +32,7 @@ class MicroBenchSpecialFunc {
     return {OP / 1024.0 / 1024.0 / 1024.0, "GOP"};
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
+  void run(std::vector<sycl::event>& events) {
     events.push_back(args.device_queue.submit([&](s::handler& cgh) {
       auto in = input_buf.template get_access<s::access::mode::read>(cgh);
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
@@ -62,7 +62,7 @@ class MicroBenchSpecialFunc {
       v2 = s::tan(v0);
     }
     const DataT expected = v2;
-    auto result = output_buf.template get_access<s::access::mode::read>();
+    auto result = output_buf.get_host_access();
     for(size_t i = 0; i < args.problem_size; ++i) {
       constexpr DataT EPSILON = 1e-5;
       if(std::abs(result[i] - expected) > EPSILON) {
@@ -85,8 +85,8 @@ int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   app.run<MicroBenchSpecialFunc<float>>();
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<MicroBenchSpecialFunc<double>>();
-
+  }
   return 0;
 }
diff --git a/pattern/prefixsum.cpp b/pattern/prefixsum.cpp
index 34adabac..d508fd7f 100644
--- a/pattern/prefixsum.cpp
+++ b/pattern/prefixsum.cpp
@@ -1 +1 @@
-TODO: SYCL code with a prefix sum
+// TODO: SYCL code with a prefix sum
diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp
index bb5892fb..e1d938f4 100644
--- a/pattern/reduction.cpp
+++ b/pattern/reduction.cpp
@@ -2,81 +2,74 @@
 #include "common.h"
 
 #include <algorithm>
+#include <cassert>
 #include <iostream>
 #include <numeric>
 #include <vector>
-#include <cassert>
 
-using namespace cl;
+using namespace sycl;
 
-template <typename T> class ReductionKernelNDRange;
-template <typename T> class ReductionKernelHierarchical;
+template <typename T>
+class ReductionKernelNDRange;
+template <typename T>
+class ReductionKernelHierarchical;
 
 template <typename T>
-class Reduction
-{
+class Reduction {
 protected:
-    std::vector<T> _input;
-    BenchmarkArgs _args;
+  std::vector<T> _input;
+  BenchmarkArgs _args;
+
+  PrefetchedBuffer<T, 1> _input_buff;
+  PrefetchedBuffer<T, 1> _output_buff;
+  sycl::buffer<T, 1>* _final_output_buff;
+  T _result;
 
-    PrefetchedBuffer<T, 1> _input_buff;
-    PrefetchedBuffer<T, 1> _output_buff;
-    sycl::buffer<T, 1>* _final_output_buff;
-    T _result;
 public:
-    Reduction(const BenchmarkArgs &args)
-      : _args{args}
-  {  
-    assert(_args.problem_size % _args.local_size == 0);
-  }
+  Reduction(const BenchmarkArgs& args) : _args{args} { assert(_args.problem_size % _args.local_size == 0); }
 
-  void generate_input(std::vector<T>& out)
-  {
+  void generate_input(std::vector<T>& out) {
     out.resize(_args.problem_size);
-    for(std::size_t i = 0; i < out.size(); ++i)
-      out[i] = static_cast<T>(i);
+    for(std::size_t i = 0; i < out.size(); ++i) out[i] = static_cast<T>(i);
   }
 
   void setup() {
     generate_input(_input);
 
-    _input_buff.initialize(_args.device_queue, static_cast<const T*>(_input.data()), sycl::range<1>(_args.problem_size));
+    _input_buff.initialize(
+        _args.device_queue, static_cast<const T*>(_input.data()), sycl::range<1>(_args.problem_size));
     _output_buff.initialize(_args.device_queue, sycl::range<1>{_args.problem_size});
   }
 
 
-  void submit_ndrange(std::vector<cl::sycl::event>& events){
-    this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
-                        const size_t reduction_size, const size_t num_groups) {
+  void submit_ndrange(std::vector<sycl::event>& events) {
+    this->submit([this, &events](sycl::buffer<T, 1>* input, sycl::buffer<T, 1>* output, const size_t reduction_size,
+                     const size_t num_groups) {
       events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups));
     });
   }
 
-  void submit_hierarchical(std::vector<cl::sycl::event>& events){
-    this->submit([this, &events](sycl::buffer<T, 1> *input, sycl::buffer<T, 1> *output,
-                        const size_t reduction_size, const size_t num_groups) {
-      events.push_back(this->local_reduce_hierarchical(input, output, reduction_size,
-                                      num_groups));
+  void submit_hierarchical(std::vector<sycl::event>& events) {
+    this->submit([this, &events](sycl::buffer<T, 1>* input, sycl::buffer<T, 1>* output, const size_t reduction_size,
+                     const size_t num_groups) {
+      events.push_back(this->local_reduce_hierarchical(input, output, reduction_size, num_groups));
     });
   }
 
-  bool verify(VerificationSetting &ver) {
+  bool verify(VerificationSetting& ver) {
     T result = _final_output_buff->get_host_access()[0];
 
     // Calculate CPU result in fp64 to avoid obtaining a wrong verification result
     std::vector<double> input_fp64(_input.size());
-    for(std::size_t i = 0; i < _input.size(); ++i)
-      input_fp64[i] = static_cast<double>(_input[i]);
-
-    double delta =
-        static_cast<double>(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{});
-
+    for(std::size_t i = 0; i < _input.size(); ++i) input_fp64[i] = static_cast<double>(_input[i]);
+    // std::reduce returns the type of its init argument, so the init must be a double (0.0, not 0)
+    double delta = static_cast<double>(result) - std::reduce(input_fp64.begin(), input_fp64.end(), 0.0);
     return std::abs(delta) < 1.e-5;
   }
+
 private:
-  template<class Kernel_invocation_function>
-  void submit(Kernel_invocation_function kernel)
-  {
+  template <class Kernel_invocation_function>
+  void submit(Kernel_invocation_function kernel) {
     sycl::buffer<T, 1>* input_buff = &_input_buff.get();
     sycl::buffer<T, 1>* output_buff = &_output_buff.get();
 
@@ -85,13 +78,11 @@ class Reduction
 
     do {
       // invoke local reduction
-      kernel(input_buff, output_buff, current_reduction_size,
-             current_num_groups);
+      kernel(input_buff, output_buff, current_reduction_size, current_num_groups);
 
       current_reduction_size = current_num_groups;
       if(current_num_groups > 1)
-        current_num_groups = 
-          (current_reduction_size + _args.local_size - 1) / _args.local_size;
+        current_num_groups = (current_reduction_size + _args.local_size - 1) / _args.local_size;
       else
         // This was the final iteration
         current_num_groups = 0;
@@ -101,113 +92,87 @@ class Reduction
         std::swap(input_buff, output_buff);
 
     } while(current_num_groups > 0);
-    
+
     _final_output_buff = output_buff;
   }
 
-  sycl::event local_reduce_ndrange(
-    sycl::buffer<T,1>* input, sycl::buffer<T,1>* output,
-    const size_t reduction_size, const std::size_t num_groups)
-  {
-    return _args.device_queue.submit([&](sycl::handler &cgh) {
-
-      sycl::nd_range<1> ndrange{num_groups * _args.local_size,
-                                _args.local_size};
+  sycl::event local_reduce_ndrange(sycl::buffer<T, 1>* input, sycl::buffer<T, 1>* output, const size_t reduction_size,
+      const std::size_t num_groups) {
+    return _args.device_queue.submit([&](sycl::handler& cgh) {
+      sycl::nd_range<1> ndrange{num_groups * _args.local_size, _args.local_size};
 
-      using namespace cl::sycl::access;
+      using namespace sycl::access;
 
-      auto acc     = input->template get_access<mode::read>(cgh);
+      auto acc = input->template get_access<mode::read>(cgh);
       auto acc_out = output->template get_access<mode::discard_write>(cgh);
-      auto scratch = sycl::accessor<T, 1, mode::read_write, target::local>
-        {_args.local_size, cgh};
-
+      auto scratch = sycl::local_accessor<T, 1>{_args.local_size, cgh};
       const int group_size = _args.local_size;
 
-      cgh.parallel_for<ReductionKernelNDRange<T>>(
-        ndrange,
-        [=](sycl::nd_item<1> item) {
-          
-          const int lid = item.get_local_id(0);
-          const auto gid = item.get_global_id();
-
-          scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0;
-          
-          for(int i = group_size/2; i > 0; i /= 2) {
-
-            item.barrier();
-            if(lid < i) 
-              scratch[lid] += scratch[lid + i];
-
-          }
-          if(lid == 0)
-            acc_out[item.get_group(0)] = scratch[0];
-        });
+      cgh.parallel_for<ReductionKernelNDRange<T>>(ndrange, [=](sycl::nd_item<1> item) {
+        const int lid = item.get_local_id(0);
+        const auto gid = item.get_global_id();
+
+        scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0;
+
+        for(int i = group_size / 2; i > 0; i /= 2) {
+          sycl::group_barrier(item.get_group());
+          if(lid < i)
+            scratch[lid] += scratch[lid + i];
+        }
+        if(lid == 0)
+          acc_out[item.get_group(0)] = scratch[0];
+      });
     }); // submit
   }
 
-  sycl::event local_reduce_hierarchical(
-    sycl::buffer<T,1>* input, sycl::buffer<T,1>* output, 
-    const size_t reduction_size, const std::size_t num_groups)
-  {
-    return _args.device_queue.submit(
-        [&](sycl::handler& cgh) {
-
+  sycl::event local_reduce_hierarchical(sycl::buffer<T, 1>* input, sycl::buffer<T, 1>* output,
+      const size_t reduction_size, const std::size_t num_groups) {
+    return _args.device_queue.submit([&](sycl::handler& cgh) {
       using namespace sycl::access;
 
-      auto acc     = input->template get_access<mode::read>(cgh);
+      auto acc = input->template get_access<mode::read>(cgh);
       auto acc_out = output->template get_access<mode::discard_write>(cgh);
 
-      auto scratch = sycl::accessor<T, 1, mode::read_write, target::local>
-        {_args.local_size, cgh};
+      auto scratch = sycl::local_accessor<T, 1>{_args.local_size, cgh};
 
       const int group_size = _args.local_size;
 
       cgh.parallel_for_work_group<ReductionKernelHierarchical<T>>(
-        sycl::range<1>{num_groups},
-        sycl::range<1>{_args.local_size},
-        [=](sycl::group<1> grp) {
-
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            const int lid = idx.get_local_id(0);
-            const auto gid = idx.get_global_id();
-
-            scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0;
-          });
-        
-          for(int i = group_size/2; i > 0; i /= 2) {
-            grp.parallel_for_work_item([&](sycl::h_item<1> idx){
+          sycl::range<1>{num_groups}, sycl::range<1>{_args.local_size}, [=](sycl::group<1> grp) {
+            grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
               const int lid = idx.get_local_id(0);
+              const auto gid = idx.get_global_id();
 
-              if (lid < i) 
-                scratch[lid] += scratch[lid + i];
+              scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0;
             });
-          }
-
-          // Spawn another parallel_for_work_item to work around
-          // limitations in hipSYCL device implementation of
-          // hierarchical parallel for
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            if(idx.get_local_id(0) == 0)
-              acc_out[grp.get_id(0)] = scratch[0];
-          });
 
-        });
+            for(int i = group_size / 2; i > 0; i /= 2) {
+              grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+                const int lid = idx.get_local_id(0);
+
+                if(lid < i)
+                  scratch[lid] += scratch[lid + i];
+              });
+            }
+
+            // Spawn another parallel_for_work_item to work around
+            // limitations in hipSYCL device implementation of
+            // hierarchical parallel for
+            grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+              if(idx.get_local_id(0) == 0)
+                acc_out[grp.get_group_id(0)] = scratch[0];
+            });
+          });
     }); // submit
   }
- 
 };
 
-template<class T>
-class ReductionNDRange : public Reduction<T>
-{
+template <class T>
+class ReductionNDRange : public Reduction<T> {
 public:
-  ReductionNDRange(const BenchmarkArgs &args)
-  : Reduction<T>{args}
-  {}
+  ReductionNDRange(const BenchmarkArgs& args) : Reduction<T>{args} {}
 
-  void run(std::vector<cl::sycl::event>& events){
-    this->submit_ndrange(events);
-  }
+  void run(std::vector<sycl::event>& events) { this->submit_ndrange(events); }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
@@ -217,15 +182,12 @@ class ReductionNDRange : public Reduction<T>
   }
 };
 
-template<class T>
-class ReductionHierarchical : public Reduction<T>
-{
+template <class T>
+class ReductionHierarchical : public Reduction<T> {
 public:
-  ReductionHierarchical(const BenchmarkArgs &args)
-  : Reduction<T>{args}
-  {}
+  ReductionHierarchical(const BenchmarkArgs& args) : Reduction<T>{args} {}
 
-  void run(std::vector<cl::sycl::event>& events){
+  void run(std::vector<sycl::event>& events) {
     this->submit_hierarchical(events);
     // Waiting is not necessary as the BenchmarkManager will already call
     // wait_and_throw() here
@@ -239,29 +201,26 @@ class ReductionHierarchical : public Reduction<T>
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   // Using short will lead to overflow even for
   // small problem sizes
-  //app.run< ReductionNDRange<short>>();
-  if(app.shouldRunNDRangeKernels()){
-    app.run< ReductionNDRange<int>>();
-    app.run< ReductionNDRange<long long>>();
-    app.run< ReductionNDRange<float>>();
-    if(app.deviceSupportsFP64())
+  // app.run< ReductionNDRange<short>>();
+  if(app.shouldRunNDRangeKernels()) {
+    app.run<ReductionNDRange<int>>();
+    app.run<ReductionNDRange<long long>>();
+    app.run<ReductionNDRange<float>>();
+    if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
       app.run<ReductionNDRange<double>>();
+    }
   }
-  //app.run< ReductionHierarchical<short>>();
-  app.run< ReductionHierarchical<int>>();
-  app.run< ReductionHierarchical<long long>>();
-  app.run< ReductionHierarchical<float>>();
-  if(app.deviceSupportsFP64())
+  // app.run< ReductionHierarchical<short>>();
+  app.run<ReductionHierarchical<int>>();
+  app.run<ReductionHierarchical<long long>>();
+  app.run<ReductionHierarchical<float>>();
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<ReductionHierarchical<double>>();
-
+  }
   return 0;
 }
-
-
-
diff --git a/pattern/scan.cpp b/pattern/scan.cpp
index 1aaacc15..9258b297 100644
--- a/pattern/scan.cpp
+++ b/pattern/scan.cpp
@@ -1 +1 @@
-TODO: SYCL code with a scan
+// TODO: SYCL code with a scan
diff --git a/pattern/segmentedreduction.cpp b/pattern/segmentedreduction.cpp
index ea22f1f1..fe35c0f9 100644
--- a/pattern/segmentedreduction.cpp
+++ b/pattern/segmentedreduction.cpp
@@ -1,130 +1,108 @@
 
 #include "common.h"
 
+#include <cassert>
 #include <iostream>
 #include <vector>
-#include <cassert>
 
-using namespace cl;
+using namespace sycl;
 
-template <typename T> class ReductionKernelNDRange;
-template <typename T> class ReductionKernelHierarchical;
+template <typename T>
+class ReductionKernelNDRange;
+template <typename T>
+class ReductionKernelHierarchical;
 
 template <typename T>
-class SegmentedReduction
-{
+class SegmentedReduction {
 protected:
-    std::vector<T> _input;
-    BenchmarkArgs _args;
-    PrefetchedBuffer<T, 1> _buff;
+  std::vector<T> _input;
+  BenchmarkArgs _args;
+  PrefetchedBuffer<T, 1> _buff;
+
 public:
-  SegmentedReduction(const BenchmarkArgs &args)
-      : _args{args}
-  {
-    
-    assert(_args.problem_size % _args.local_size == 0);
-  }
+  SegmentedReduction(const BenchmarkArgs& args) : _args{args} { assert(_args.problem_size % _args.local_size == 0); }
 
-  void generate_input(std::vector<T>& out)
-  {
+  void generate_input(std::vector<T>& out) {
     out.resize(_args.problem_size);
-    for(std::size_t i = 0; i < out.size(); ++i)
-      out[i] = static_cast<T>(i);
+    for(std::size_t i = 0; i < out.size(); ++i) out[i] = static_cast<T>(i);
   }
 
   void setup() {
     generate_input(_input);
-    _buff.initialize(_args.device_queue,_input.data(), sycl::range<1>(_args.problem_size));
+    _buff.initialize(_args.device_queue, _input.data(), sycl::range<1>(_args.problem_size));
   }
 
-  void submit_ndrange(std::vector<cl::sycl::event>& events){
-    
-    events.push_back(_args.device_queue.submit(
-        [&](sycl::handler& cgh) {
-
-      sycl::nd_range<1> ndrange {_args.problem_size, _args.local_size};
+  void submit_ndrange(std::vector<sycl::event>& events) {
+    events.push_back(_args.device_queue.submit([&](sycl::handler& cgh) {
+      sycl::nd_range<1> ndrange{_args.problem_size, _args.local_size};
 
-      using namespace cl::sycl::access;
+      using namespace sycl::access;
 
       auto acc = _buff.template get_access<mode::read_write>(cgh);
-      auto scratch = sycl::accessor<T, 1, mode::read_write, target::local>
-        {_args.local_size, cgh};
+      auto scratch = sycl::local_accessor<T, 1>{_args.local_size, cgh};
 
       const int group_size = _args.local_size;
 
-      cgh.parallel_for<ReductionKernelNDRange<T>>(
-        ndrange,
-        [=](sycl::nd_item<1> item) {
-          
-          const int lid = item.get_local_id(0);
-          const auto gid = item.get_global_id();
-
-          scratch[lid] = acc[gid];
-
-          for(int i = group_size/2; i > 0; i /= 2) {
+      cgh.parallel_for<ReductionKernelNDRange<T>>(ndrange, [=](sycl::nd_item<1> item) {
+        const int lid = item.get_local_id(0);
+        const auto gid = item.get_global_id();
 
-            item.barrier();
-            if(lid < i) 
-              scratch[lid] += scratch[lid + i];
+        scratch[lid] = acc[gid];
 
-          }
-          if(lid == 0) 
-            acc[gid] = scratch[0];
-        });
+        for(int i = group_size / 2; i > 0; i /= 2) {
+          sycl::group_barrier(item.get_group());
+          if(lid < i)
+            scratch[lid] += scratch[lid + i];
+        }
+        if(lid == 0)
+          acc[gid] = scratch[0];
+      });
     })); // submit
   }
 
-  void submit_hierarchical(std::vector<cl::sycl::event>& events){
-
-    events.push_back(_args.device_queue.submit(
-        [&](sycl::handler& cgh) {
-
+  void submit_hierarchical(std::vector<sycl::event>& events) {
+    events.push_back(_args.device_queue.submit([&](sycl::handler& cgh) {
       using namespace sycl::access;
 
       auto acc = _buff.template get_access<mode::read_write>(cgh);
-      auto scratch = sycl::accessor<T, 1, mode::read_write, target::local>
-        {_args.local_size, cgh};
+      auto scratch = sycl::local_accessor<T, 1>{_args.local_size, cgh};
 
       const int group_size = _args.local_size;
 
-      cgh.parallel_for_work_group<ReductionKernelHierarchical<T>>(
-        sycl::range<1>{_args.problem_size / _args.local_size},
-        sycl::range<1>{_args.local_size},
-        [=](sycl::group<1> grp) {
-
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            const int lid = idx.get_local_id(0);
-            const auto gid = idx.get_global_id();
-
-            scratch[lid] = acc[gid];
-          });
-        
-          for(int i = group_size/2; i > 0; i /= 2) {
-            grp.parallel_for_work_item([&](sycl::h_item<1> idx){
+      cgh.parallel_for_work_group<ReductionKernelHierarchical<T>>(sycl::range<1>{_args.problem_size / _args.local_size},
+          sycl::range<1>{_args.local_size}, [=](sycl::group<1> grp) {
+            grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
               const int lid = idx.get_local_id(0);
+              const auto gid = idx.get_global_id();
 
-              if (lid < i) 
-                scratch[lid] += scratch[lid + i];
+              scratch[lid] = acc[gid];
             });
-          }
 
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            if(idx.get_local_id(0) == 0)
-              acc[idx.get_global_id()] = scratch[0];
+            for(int i = group_size / 2; i > 0; i /= 2) {
+              grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+                const int lid = idx.get_local_id(0);
+
+                if(lid < i)
+                  scratch[lid] += scratch[lid + i];
+              });
+            }
+
+            grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+              if(idx.get_local_id(0) == 0)
+                acc[idx.get_global_id()] = scratch[0];
+            });
           });
-        });
     })); // submit
   }
 
-  bool verify(VerificationSetting &ver) {
+  bool verify(VerificationSetting& ver) {
     std::vector<T> original_input;
     generate_input(original_input);
 
-    auto acc = _buff.template get_access<sycl::access::mode::read>();
+    auto acc = _buff.get_host_access();
     size_t num_groups = _args.problem_size / _args.local_size;
 
     for(size_t group = 0; group < num_groups; ++group) {
-      
       size_t group_offset = group * _args.local_size;
       T sum = 0;
 
@@ -144,19 +122,14 @@ class SegmentedReduction
 
     return true;
   }
-
- 
 };
 
-template<class T>
-class SegmentedReductionNDRange : public SegmentedReduction<T>
-{
+template <class T>
+class SegmentedReductionNDRange : public SegmentedReduction<T> {
 public:
-  SegmentedReductionNDRange(const BenchmarkArgs &args)
-  : SegmentedReduction<T>{args}
-  {}
+  SegmentedReductionNDRange(const BenchmarkArgs& args) : SegmentedReduction<T>{args} {}
 
-  void run(std::vector<cl::sycl::event>& events){
+  void run(std::vector<sycl::event>& events) {
     this->submit_ndrange(events);
     // Waiting is not necessary as the BenchmarkManager will already call
     // wait_and_throw() here
@@ -170,15 +143,12 @@ class SegmentedReductionNDRange : public SegmentedReduction<T>
   }
 };
 
-template<class T>
-class SegmentedReductionHierarchical : public SegmentedReduction<T>
-{
+template <class T>
+class SegmentedReductionHierarchical : public SegmentedReduction<T> {
 public:
-  SegmentedReductionHierarchical(const BenchmarkArgs &args)
-  : SegmentedReduction<T>{args}
-  {}
+  SegmentedReductionHierarchical(const BenchmarkArgs& args) : SegmentedReduction<T>{args} {}
 
-  void run(std::vector<cl::sycl::event>& events){
+  void run(std::vector<sycl::event>& events) {
     this->submit_hierarchical(events);
     // Waiting is not necessary as the BenchmarkManager will already call
     // wait_and_throw() here
@@ -192,28 +162,25 @@ class SegmentedReductionHierarchical : public SegmentedReduction<T>
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   if(app.shouldRunNDRangeKernels()) {
-    app.run< SegmentedReductionNDRange<short>>();
-    app.run< SegmentedReductionNDRange<int>>();
-    app.run< SegmentedReductionNDRange<long long>>();
-    app.run< SegmentedReductionNDRange<float>>();
-    if(app.deviceSupportsFP64())
+    app.run<SegmentedReductionNDRange<short>>();
+    app.run<SegmentedReductionNDRange<int>>();
+    app.run<SegmentedReductionNDRange<long long>>();
+    app.run<SegmentedReductionNDRange<float>>();
+    if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
       app.run<SegmentedReductionNDRange<double>>();
+    }
   }
 
-  app.run< SegmentedReductionHierarchical<short>>();
-  app.run< SegmentedReductionHierarchical<int>>();
-  app.run< SegmentedReductionHierarchical<long long>>();
-  app.run< SegmentedReductionHierarchical<float>>();
-  if(app.deviceSupportsFP64())
+  app.run<SegmentedReductionHierarchical<short>>();
+  app.run<SegmentedReductionHierarchical<int>>();
+  app.run<SegmentedReductionHierarchical<long long>>();
+  app.run<SegmentedReductionHierarchical<float>>();
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<SegmentedReductionHierarchical<double>>();
-
+  }
   return 0;
 }
-
-
-
diff --git a/pattern/segmentedscan.cpp b/pattern/segmentedscan.cpp
index 64de7319..dc892c82 100644
--- a/pattern/segmentedscan.cpp
+++ b/pattern/segmentedscan.cpp
@@ -1 +1 @@
-TODO: SYCL code with segmented scan
+// TODO: SYCL code with segmented scan
diff --git a/polybench/2DConvolution.cpp b/polybench/2DConvolution.cpp
index 65c2d825..c35410cc 100644
--- a/polybench/2DConvolution.cpp
+++ b/polybench/2DConvolution.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -13,106 +13,108 @@ using DATA_TYPE = float;
 class conv2D;
 
 void init(DATA_TYPE* A, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-
-	for(size_t i = 0; i < NI; ++i) {
-		for(size_t j = 0; j < NJ; ++j) {
-			A[i * NJ + j] = (float)rand() / (float)RAND_MAX;
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+
+  for(size_t i = 0; i < NI; ++i) {
+    for(size_t j = 0; j < NJ; ++j) {
+      A[i * NJ + j] = (float)rand() / (float)RAND_MAX;
+    }
+  }
 }
 
 void conv2D(DATA_TYPE* A, DATA_TYPE* B, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-
-	const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
-	const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
-	const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;
-
-	for(size_t i = 1; i < NI - 1; ++i) {
-		for(size_t j = 1; j < NJ - 1; ++j) {
-			B[i * NJ + j] = c11 * A[(i - 1) * NJ + (j - 1)] + c12 * A[(i + 0) * NJ + (j - 1)] + c13 * A[(i + 1) * NJ + (j - 1)]
-			                + c21 * A[(i - 1) * NJ + (j + 0)] + c22 * A[(i + 0) * NJ + (j + 0)] + c23 * A[(i + 1) * NJ + (j + 0)]
-			                + c31 * A[(i - 1) * NJ + (j + 1)] + c32 * A[(i + 0) * NJ + (j + 1)] + c33 * A[(i + 1) * NJ + (j + 1)];
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+
+  const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
+  const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
+  const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;
+
+  for(size_t i = 1; i < NI - 1; ++i) {
+    for(size_t j = 1; j < NJ - 1; ++j) {
+      B[i * NJ + j] =
+          c11 * A[(i - 1) * NJ + (j - 1)] + c12 * A[(i + 0) * NJ + (j - 1)] + c13 * A[(i + 1) * NJ + (j - 1)] +
+          c21 * A[(i - 1) * NJ + (j + 0)] + c22 * A[(i + 0) * NJ + (j + 0)] + c23 * A[(i + 1) * NJ + (j + 0)] +
+          c31 * A[(i - 1) * NJ + (j + 1)] + c32 * A[(i + 0) * NJ + (j + 1)] + c33 * A[(i + 1) * NJ + (j + 1)];
+    }
+  }
 }
 
 class Polybench_2DConvolution {
-  public:
-	Polybench_2DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+public:
+  Polybench_2DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
 
-		init(A.data(), size);
+    init(A.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::discard_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::discard_write>(cgh);
 
-			cgh.parallel_for<class conv2D>(B_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
+      cgh.parallel_for<class conv2D>(B_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
 
-				const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
-				const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
-				const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;
+        const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
+        const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
+        const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;
 
-				if((i > 0) && (j > 0) && (i < size_ - 1) && (j < size_ - 1)) {
-					B[item] = c11 * A[{(i - 1), (j - 1)}] + c12 * A[{(i + 0), (j - 1)}] + c13 * A[{(i + 1), (j - 1)}] + c21 * A[{(i - 1), (j + 0)}]
-					          + c22 * A[{(i + 0), (j + 0)}] + c23 * A[{(i + 1), (j + 0)}] + c31 * A[{(i - 1), (j + 1)}] + c32 * A[{(i + 0), (j + 1)}]
-					          + c33 * A[{(i + 1), (j + 1)}];
-				}
-			});
-		}));
-	}
+        if((i > 0) && (j > 0) && (i < size_ - 1) && (j < size_ - 1)) {
+          B[item] = c11 * A[{(i - 1), (j - 1)}] + c12 * A[{(i + 0), (j - 1)}] + c13 * A[{(i + 1), (j - 1)}] +
+                    c21 * A[{(i - 1), (j + 0)}] + c22 * A[{(i + 0), (j + 0)}] + c23 * A[{(i + 1), (j + 0)}] +
+                    c31 * A[{(i - 1), (j + 1)}] + c32 * A[{(i + 0), (j + 1)}] + c33 * A[{(i + 1), (j + 1)}];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		auto B_acc = B_buffer.get_access<cl::sycl::access::mode::read>();
+    auto B_acc = B_buffer.get_host_access();
 
-		std::vector<DATA_TYPE> B_cpu(size * size);
-		conv2D(A.data(), B_cpu.data(), size);
+    std::vector<DATA_TYPE> B_cpu(size * size);
+    conv2D(A.data(), B_cpu.data(), size);
 
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				if((i > 0) && (j > 0) && (i < size - 1) && (j < size - 1)) {
-					const auto diff = percentDiff(B_cpu[i * size + j], B_acc.get_pointer()[i * size + j]);
-					if(diff > ERROR_THRESHOLD) return false;
-				}
-			}
-		}
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        if((i > 0) && (j > 0) && (i < size - 1) && (j < size - 1)) {
+          const auto diff = percentDiff(B_cpu[i * size + j], B_acc.get_pointer()[i * size + j]);
+          if(diff > ERROR_THRESHOLD)
+            return false;
+        }
+      }
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2DConvolution"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2DConvolution"; }
 
-  private:
-	BenchmarkArgs args;
+private:
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
 
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_2DConvolution>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_2DConvolution>();
+  return 0;
 }
diff --git a/polybench/2mm.cpp b/polybench/2mm.cpp
index 13286190..0f28d864 100644
--- a/polybench/2mm.cpp
+++ b/polybench/2mm.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -14,156 +14,157 @@ class Polybench_2mm_2;
 class Polybench_2mm_1;
 
 void init_array(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-	const auto NL = size;
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NK; j++) {
-			A[i * NI + j] = ((DATA_TYPE)i * j) / NI;
-		}
-	}
-
-	for(size_t i = 0; i < NK; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			B[i * NK + j] = ((DATA_TYPE)i * (j + 1)) / NJ;
-		}
-	}
-
-	for(size_t i = 0; i < NL; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			C[i * NL + j] = ((DATA_TYPE)i * (j + 3)) / NL;
-		}
-	}
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NL; j++) {
-			D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK;
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+  const auto NL = size;
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NK; j++) {
+      A[i * NI + j] = ((DATA_TYPE)i * j) / NI;
+    }
+  }
+
+  for(size_t i = 0; i < NK; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      B[i * NK + j] = ((DATA_TYPE)i * (j + 1)) / NJ;
+    }
+  }
+
+  for(size_t i = 0; i < NL; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      C[i * NL + j] = ((DATA_TYPE)i * (j + 3)) / NL;
+    }
+  }
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NL; j++) {
+      D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK;
+    }
+  }
 }
 
 void mm2_cpu(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-	const auto NL = size;
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			for(size_t k = 0; k < NK; ++k) {
-				C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
-			}
-		}
-	}
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NL; j++) {
-			E[i * NL + j] = 0;
-			for(size_t k = 0; k < NJ; ++k) {
-				E[i * NL + j] += C[i * NJ + k] * D[k * NL + j];
-			}
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+  const auto NL = size;
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      for(size_t k = 0; k < NK; ++k) {
+        C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
+      }
+    }
+  }
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NL; j++) {
+      E[i * NL + j] = 0;
+      for(size_t k = 0; k < NJ; ++k) {
+        E[i * NL + j] += C[i * NJ + k] * D[k * NL + j];
+      }
+    }
+  }
 }
 
 class Polybench_2mm {
-  public:
-	Polybench_2mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
-		C.resize(size * size);
-		D.resize(size * size);
-		E.resize(size * size);
-
-		init_array(A.data(), B.data(), C.data(), D.data(), size);
-
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-		C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size));
-		D_buffer.initialize(args.device_queue, D.data(), cl::sycl::range<2>(size, size));
-		E_buffer.initialize(args.device_queue, E.data(), cl::sycl::range<2>(size, size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::read>(cgh);
-			auto C = C_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Polybench_2mm_1>(C_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-
-				for(size_t k = 0; k < size_; k++) {
-					C[item] += A[{i, k}] * B[{k, j}];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto C = C_buffer.get_access<access::mode::read>(cgh);
-			auto D = D_buffer.get_access<access::mode::read>(cgh);
-			auto E = E_buffer.get_access<access::mode::discard_write>(cgh);
-
-			cgh.parallel_for<Polybench_2mm_2>(E_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-
-				E[item] = 0;
-				for(size_t k = 0; k < size_; k++) {
-					E[item] += C[{i, k}] * D[{k, j}];
-				}
-			});
-		}));
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		init_array(A.data(), B.data(), C.data(), D.data(), size);
-
-		std::vector<DATA_TYPE> E_cpu(size * size);
-		mm2_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), size);
-
-		auto E_acc = E_buffer.get_access<cl::sycl::access::mode::read>();
-
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(E_cpu[i * size + j], E_acc.get_pointer()[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2mm"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-	std::vector<DATA_TYPE> C;
-	std::vector<DATA_TYPE> D;
-	std::vector<DATA_TYPE> E;
-
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> D_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> E_buffer;
+public:
+  Polybench_2mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
+    C.resize(size * size);
+    D.resize(size * size);
+    E.resize(size * size);
+
+    init_array(A.data(), B.data(), C.data(), D.data(), size);
+
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+    C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size));
+    D_buffer.initialize(args.device_queue, D.data(), sycl::range<2>(size, size));
+    E_buffer.initialize(args.device_queue, E.data(), sycl::range<2>(size, size));
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::read>(cgh);
+      auto C = C_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Polybench_2mm_1>(C_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+
+        for(size_t k = 0; k < size_; k++) {
+          C[item] += A[{i, k}] * B[{k, j}];
+        }
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto C = C_buffer.get_access<access::mode::read>(cgh);
+      auto D = D_buffer.get_access<access::mode::read>(cgh);
+      auto E = E_buffer.get_access<access::mode::discard_write>(cgh);
+
+      cgh.parallel_for<Polybench_2mm_2>(E_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+
+        E[item] = 0;
+        for(size_t k = 0; k < size_; k++) {
+          E[item] += C[{i, k}] * D[{k, j}];
+        }
+      });
+    }));
+  }
+
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    init_array(A.data(), B.data(), C.data(), D.data(), size);
+
+    std::vector<DATA_TYPE> E_cpu(size * size);
+    mm2_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), size);
+
+    auto E_acc = E_buffer.get_host_access();
+
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(E_cpu[i * size + j], E_acc.get_pointer()[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2mm"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+  std::vector<DATA_TYPE> C;
+  std::vector<DATA_TYPE> D;
+  std::vector<DATA_TYPE> E;
+
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> D_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> E_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_2mm>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_2mm>();
+  return 0;
 }
diff --git a/polybench/3DConvolution.cpp b/polybench/3DConvolution.cpp
index 2b057ac7..d199b5a3 100644
--- a/polybench/3DConvolution.cpp
+++ b/polybench/3DConvolution.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -13,125 +13,135 @@ using DATA_TYPE = float;
 class conv3D;
 
 void init(DATA_TYPE* A, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-
-	for(size_t i = 0; i < NI; ++i) {
-		for(size_t j = 0; j < NJ; ++j) {
-			for(size_t k = 0; k < NK; ++k) {
-				A[i * (NK * NJ) + j * NK + k] = i % 12 + 2 * (j % 7) + 3 * (k % 13);
-			}
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+
+  for(size_t i = 0; i < NI; ++i) {
+    for(size_t j = 0; j < NJ; ++j) {
+      for(size_t k = 0; k < NK; ++k) {
+        A[i * (NK * NJ) + j * NK + k] = i % 12 + 2 * (j % 7) + 3 * (k % 13);
+      }
+    }
+  }
 }
 
 void conv3D(DATA_TYPE* A, DATA_TYPE* B, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-
-	const DATA_TYPE c11 = +2, c21 = +5, c31 = -8;
-	const DATA_TYPE c12 = -3, c22 = +6, c32 = -9;
-	const DATA_TYPE c13 = +4, c23 = +7, c33 = +10;
-
-	for(size_t i = 1; i < NI - 1; ++i) {
-		for(size_t j = 1; j < NJ - 1; ++j) {
-			for(size_t k = 1; k < NK - 1; ++k) {
-				B[i * (NK * NJ) + j * NK + k] = c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)]
-				                                + c21 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c23 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)]
-				                                + c31 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c33 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)]
-				                                + c12 * A[(i + 0) * (NK * NJ) + (j - 1) * NK + (k + 0)] + c22 * A[(i + 0) * (NK * NJ) + (j + 0) * NK + (k + 0)]
-				                                + c32 * A[(i + 0) * (NK * NJ) + (j + 1) * NK + (k + 0)] + c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k + 1)]
-				                                + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] + c21 * A[(i - 1) * (NK * NJ) + (j + 0) * NK + (k + 1)]
-				                                + c23 * A[(i + 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] + c31 * A[(i - 1) * (NK * NJ) + (j + 1) * NK + (k + 1)]
-				                                + c33 * A[(i + 1) * (NK * NJ) + (j + 1) * NK + (k + 1)];
-			}
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+
+  const DATA_TYPE c11 = +2, c21 = +5, c31 = -8;
+  const DATA_TYPE c12 = -3, c22 = +6, c32 = -9;
+  const DATA_TYPE c13 = +4, c23 = +7, c33 = +10;
+
+  for(size_t i = 1; i < NI - 1; ++i) {
+    for(size_t j = 1; j < NJ - 1; ++j) {
+      for(size_t k = 1; k < NK - 1; ++k) {
+        B[i * (NK * NJ) + j * NK + k] = c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c21 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c23 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c31 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c33 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] +
+                                        c12 * A[(i + 0) * (NK * NJ) + (j - 1) * NK + (k + 0)] +
+                                        c22 * A[(i + 0) * (NK * NJ) + (j + 0) * NK + (k + 0)] +
+                                        c32 * A[(i + 0) * (NK * NJ) + (j + 1) * NK + (k + 0)] +
+                                        c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] +
+                                        c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] +
+                                        c21 * A[(i - 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] +
+                                        c23 * A[(i + 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] +
+                                        c31 * A[(i - 1) * (NK * NJ) + (j + 1) * NK + (k + 1)] +
+                                        c33 * A[(i + 1) * (NK * NJ) + (j + 1) * NK + (k + 1)];
+      }
+    }
+  }
 }
 
 class Polybench_3DConvolution {
-  public:
-	Polybench_3DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		A.resize(size * size * size);
-		B.resize(size * size * size);
-
-		init(A.data(), size);
-
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<3>(size, size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<3>(size, size, size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::discard_write>(cgh);
-
-			cgh.parallel_for<class conv3D>(B_buffer.get_range(), [=, size_ = size](item<3> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-				const auto k = item[2];
-
-				const DATA_TYPE c11 = +2, c21 = +5, c31 = -8;
-				const DATA_TYPE c12 = -3, c22 = +6, c32 = -9;
-				const DATA_TYPE c13 = +4, c23 = +7, c33 = +10;
-
-				if((i > 0) && (j > 0) && (k > 0) && (i < (size_ - 1)) && (j < (size_ - 1)) && (k < (size_ - 1))) {
-					B[item] = c11 * A[{(i - 1), (j - 1), (k - 1)}] + c13 * A[{(i + 1), (j - 1), (k - 1)}] + c21 * A[{(i - 1), (j - 1), (k - 1)}]
-					          + c23 * A[{(i + 1), (j - 1), (k - 1)}] + c31 * A[{(i - 1), (j - 1), (k - 1)}] + c33 * A[{(i + 1), (j - 1), (k - 1)}]
-					          + c12 * A[{(i + 0), (j - 1), (k + 0)}] + c22 * A[{(i + 0), (j + 0), (k + 0)}] + c32 * A[{(i + 0), (j + 1), (k + 0)}]
-					          + c11 * A[{(i - 1), (j - 1), (k + 1)}] + c13 * A[{(i + 1), (j - 1), (k + 1)}] + c21 * A[{(i - 1), (j + 0), (k + 1)}]
-					          + c23 * A[{(i + 1), (j + 0), (k + 1)}] + c31 * A[{(i - 1), (j + 1), (k + 1)}] + c33 * A[{(i + 1), (j + 1), (k + 1)}];
-				}
-			});
-		}));
-	}
-
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		std::vector<DATA_TYPE> B_cpu(size * size * size);
-		conv3D(A.data(), B_cpu.data(), size);
-
-		auto B_acc = B_buffer.get_access<cl::sycl::access::mode::read>();
-
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				for(size_t k = 0; k < size; k++) {
-					if((i > 0) && (j > 0) && (k > 0) && (i < (size - 1)) && (j < (size - 1)) && (k < (size - 1))) {
-						const auto diff = percentDiff(B_cpu[i * (size * size) + j * size + k],
-								B_acc.get_pointer()[i * (size * size) + j * size + k]);
-						if(diff > ERROR_THRESHOLD)
-							return false;
-					}
-				}
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3DConvolution"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-
-	PrefetchedBuffer<DATA_TYPE, 3> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 3> B_buffer;
+public:
+  Polybench_3DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    A.resize(size * size * size);
+    B.resize(size * size * size);
+
+    init(A.data(), size);
+
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<3>(size, size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<3>(size, size, size));
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::discard_write>(cgh);
+
+      cgh.parallel_for<class conv3D>(B_buffer.get_range(), [=, size_ = size](item<3> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+        const auto k = item[2];
+
+        const DATA_TYPE c11 = +2, c21 = +5, c31 = -8;
+        const DATA_TYPE c12 = -3, c22 = +6, c32 = -9;
+        const DATA_TYPE c13 = +4, c23 = +7, c33 = +10;
+
+        if((i > 0) && (j > 0) && (k > 0) && (i < (size_ - 1)) && (j < (size_ - 1)) && (k < (size_ - 1))) {
+          B[item] = c11 * A[{(i - 1), (j - 1), (k - 1)}] + c13 * A[{(i + 1), (j - 1), (k - 1)}] +
+                    c21 * A[{(i - 1), (j - 1), (k - 1)}] + c23 * A[{(i + 1), (j - 1), (k - 1)}] +
+                    c31 * A[{(i - 1), (j - 1), (k - 1)}] + c33 * A[{(i + 1), (j - 1), (k - 1)}] +
+                    c12 * A[{(i + 0), (j - 1), (k + 0)}] + c22 * A[{(i + 0), (j + 0), (k + 0)}] +
+                    c32 * A[{(i + 0), (j + 1), (k + 0)}] + c11 * A[{(i - 1), (j - 1), (k + 1)}] +
+                    c13 * A[{(i + 1), (j - 1), (k + 1)}] + c21 * A[{(i - 1), (j + 0), (k + 1)}] +
+                    c23 * A[{(i + 1), (j + 0), (k + 1)}] + c31 * A[{(i - 1), (j + 1), (k + 1)}] +
+                    c33 * A[{(i + 1), (j + 1), (k + 1)}];
+        }
+      });
+    }));
+  }
+
+
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    std::vector<DATA_TYPE> B_cpu(size * size * size);
+    conv3D(A.data(), B_cpu.data(), size);
+
+    auto B_acc = B_buffer.get_host_access();
+
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        for(size_t k = 0; k < size; k++) {
+          if((i > 0) && (j > 0) && (k > 0) && (i < (size - 1)) && (j < (size - 1)) && (k < (size - 1))) {
+            const auto diff = percentDiff(
+                B_cpu[i * (size * size) + j * size + k], B_acc.get_pointer()[i * (size * size) + j * size + k]);
+            if(diff > ERROR_THRESHOLD)
+              return false;
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3DConvolution"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+
+  PrefetchedBuffer<DATA_TYPE, 3> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 3> B_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_3DConvolution>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_3DConvolution>();
+  return 0;
 }
diff --git a/polybench/3mm.cpp b/polybench/3mm.cpp
index d3c42467..f451b11f 100644
--- a/polybench/3mm.cpp
+++ b/polybench/3mm.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -15,196 +15,198 @@ class Polybench_3mm_2;
 class Polybench_3mm_3;
 
 void init_array(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-	const auto NL = size;
-	const auto NM = size;
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NK; j++) {
-			A[i * NK + j] = ((DATA_TYPE)i * j) / NI;
-		}
-	}
-
-	for(size_t i = 0; i < NK; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			B[i * NJ + j] = ((DATA_TYPE)i * (j + 1)) / NJ;
-		}
-	}
-
-	for(size_t i = 0; i < NJ; i++) {
-		for(size_t j = 0; j < NM; j++) {
-			C[i * NM + j] = ((DATA_TYPE)i * (j + 3)) / NL;
-		}
-	}
-
-	for(size_t i = 0; i < NM; i++) {
-		for(size_t j = 0; j < NL; j++) {
-			D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK;
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+  const auto NL = size;
+  const auto NM = size;
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NK; j++) {
+      A[i * NK + j] = ((DATA_TYPE)i * j) / NI;
+    }
+  }
+
+  for(size_t i = 0; i < NK; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      B[i * NJ + j] = ((DATA_TYPE)i * (j + 1)) / NJ;
+    }
+  }
+
+  for(size_t i = 0; i < NJ; i++) {
+    for(size_t j = 0; j < NM; j++) {
+      C[i * NM + j] = ((DATA_TYPE)i * (j + 3)) / NL;
+    }
+  }
+
+  for(size_t i = 0; i < NM; i++) {
+    for(size_t j = 0; j < NL; j++) {
+      D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK;
+    }
+  }
 }
 
-void mm3_cpu(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, DATA_TYPE* F, DATA_TYPE* G, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-	const auto NL = size;
-	const auto NM = size;
-
-	/* E := A*B */
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			E[i * NJ + j] = 0;
-			for(size_t k = 0; k < NK; ++k) {
-				E[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
-			}
-		}
-	}
-
-	/* F := C*D */
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NL; j++) {
-			F[i * NL + j] = 0;
-			for(size_t k = 0; k < NM; ++k) {
-				F[i * NL + j] += C[i * NM + k] * D[k * NL + j];
-			}
-		}
-	}
-
-	/* G := E*F */
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NL; j++) {
-			G[i * NL + j] = 0;
-			for(size_t k = 0; k < NJ; ++k) {
-				G[i * NL + j] += E[i * NJ + k] * F[k * NL + j];
-			}
-		}
-	}
+void mm3_cpu(
+    DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, DATA_TYPE* F, DATA_TYPE* G, size_t size) {
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+  const auto NL = size;
+  const auto NM = size;
+
+  /* E := A*B */
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      E[i * NJ + j] = 0;
+      for(size_t k = 0; k < NK; ++k) {
+        E[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
+      }
+    }
+  }
+
+  /* F := C*D */
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NL; j++) {
+      F[i * NL + j] = 0;
+      for(size_t k = 0; k < NM; ++k) {
+        F[i * NL + j] += C[i * NM + k] * D[k * NL + j];
+      }
+    }
+  }
+
+  /* G := E*F */
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NL; j++) {
+      G[i * NL + j] = 0;
+      for(size_t k = 0; k < NJ; ++k) {
+        G[i * NL + j] += E[i * NJ + k] * F[k * NL + j];
+      }
+    }
+  }
 }
 
 class Polybench_3mm {
-  public:
-	Polybench_3mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
-		C.resize(size * size);
-		D.resize(size * size);
-		E.resize(size * size);
-		F.resize(size * size);
-		G.resize(size * size);
-
-		init_array(A.data(), B.data(), C.data(), D.data(), size);
-
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-		C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size));
-		D_buffer.initialize(args.device_queue, D.data(), cl::sycl::range<2>(size, size));
-		E_buffer.initialize(args.device_queue, E.data(), cl::sycl::range<2>(size, size));
-		F_buffer.initialize(args.device_queue, F.data(), cl::sycl::range<2>(size, size));
-		G_buffer.initialize(args.device_queue, G.data(), cl::sycl::range<2>(size, size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::read>(cgh);
-			auto E = E_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Polybench_3mm_1>(E_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-
-				for(size_t k = 0; k < size_; k++) {
-					E[item] += A[{i, k}] * B[{k, j}];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto C = C_buffer.get_access<access::mode::read>(cgh);
-			auto D = D_buffer.get_access<access::mode::read>(cgh);
-			auto F = F_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Polybench_3mm_2>(F_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-
-				for(size_t k = 0; k < size_; k++) {
-					F[item] += C[{i, k}] * D[{k, j}];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto E = E_buffer.get_access<access::mode::read>(cgh);
-			auto F = F_buffer.get_access<access::mode::read>(cgh);
-			auto G = G_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Polybench_3mm_3>(F_buffer.get_range(), [=, size_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
-
-				for(size_t k = 0; k < size_; k++) {
-					G[item] += E[{i, k}] * F[{k, j}];
-				}
-			});
-		}));
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		init_array(A.data(), B.data(), C.data(), D.data(), size);
-
-		std::vector<DATA_TYPE> E_cpu(size * size);
-		std::vector<DATA_TYPE> F_cpu(size * size);
-		std::vector<DATA_TYPE> G_cpu(size * size);
-
-		mm3_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), F_cpu.data(), G_cpu.data(), size);
-
-		auto G_acc = G_buffer.get_access<cl::sycl::access::mode::read>();
-
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(G_cpu[i * size + j], G_acc.get_pointer()[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3mm"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-	std::vector<DATA_TYPE> C;
-	std::vector<DATA_TYPE> D;
-	std::vector<DATA_TYPE> E;
-	std::vector<DATA_TYPE> F;
-	std::vector<DATA_TYPE> G;
-
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> D_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> E_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> F_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> G_buffer;
+public:
+  Polybench_3mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
+    C.resize(size * size);
+    D.resize(size * size);
+    E.resize(size * size);
+    F.resize(size * size);
+    G.resize(size * size);
+
+    init_array(A.data(), B.data(), C.data(), D.data(), size);
+
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+    C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size));
+    D_buffer.initialize(args.device_queue, D.data(), sycl::range<2>(size, size));
+    E_buffer.initialize(args.device_queue, E.data(), sycl::range<2>(size, size));
+    F_buffer.initialize(args.device_queue, F.data(), sycl::range<2>(size, size));
+    G_buffer.initialize(args.device_queue, G.data(), sycl::range<2>(size, size));
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::read>(cgh);
+      auto E = E_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Polybench_3mm_1>(E_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+
+        for(size_t k = 0; k < size_; k++) {
+          E[item] += A[{i, k}] * B[{k, j}];
+        }
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto C = C_buffer.get_access<access::mode::read>(cgh);
+      auto D = D_buffer.get_access<access::mode::read>(cgh);
+      auto F = F_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Polybench_3mm_2>(F_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+
+        for(size_t k = 0; k < size_; k++) {
+          F[item] += C[{i, k}] * D[{k, j}];
+        }
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto E = E_buffer.get_access<access::mode::read>(cgh);
+      auto F = F_buffer.get_access<access::mode::read>(cgh);
+      auto G = G_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Polybench_3mm_3>(F_buffer.get_range(), [=, size_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
+
+        for(size_t k = 0; k < size_; k++) {
+          G[item] += E[{i, k}] * F[{k, j}];
+        }
+      });
+    }));
+  }
+
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    init_array(A.data(), B.data(), C.data(), D.data(), size);
+
+    std::vector<DATA_TYPE> E_cpu(size * size);
+    std::vector<DATA_TYPE> F_cpu(size * size);
+    std::vector<DATA_TYPE> G_cpu(size * size);
+
+    mm3_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), F_cpu.data(), G_cpu.data(), size);
+
+    auto G_acc = G_buffer.get_host_access();
+
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(G_cpu[i * size + j], G_acc.get_pointer()[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3mm"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+  std::vector<DATA_TYPE> C;
+  std::vector<DATA_TYPE> D;
+  std::vector<DATA_TYPE> E;
+  std::vector<DATA_TYPE> F;
+  std::vector<DATA_TYPE> G;
+
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> D_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> E_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> F_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> G_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_3mm>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_3mm>();
+  return 0;
 }
diff --git a/polybench/atax.cpp b/polybench/atax.cpp
index 6321282d..e4aa7958 100644
--- a/polybench/atax.cpp
+++ b/polybench/atax.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -18,121 +18,122 @@ class Atax1;
 class Atax2;
 
 void init_array(DATA_TYPE* x, DATA_TYPE* A, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
-
-	for(size_t i = 0; i < NX; i++) {
-		x[i] = i * M_PI;
-		for(size_t j = 0; j < NY; j++) {
-			A[i * NY + j] = ((DATA_TYPE)i * (j)) / NX;
-		}
-	}
+  const auto NX = size;
+  const auto NY = size;
+
+  for(size_t i = 0; i < NX; i++) {
+    x[i] = i * M_PI;
+    for(size_t j = 0; j < NY; j++) {
+      A[i * NY + j] = ((DATA_TYPE)i * (j)) / NX;
+    }
+  }
 }
 
 void atax_cpu(DATA_TYPE* A, DATA_TYPE* x, DATA_TYPE* y, DATA_TYPE* tmp, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
-
-	for(size_t i = 0; i < NX; i++) {
-		for(size_t j = 0; j < NY; j++) {
-			tmp[i] += A[i * NY + j] * x[j];
-		}
-
-		for(size_t j = 0; j < NY; j++) {
-			y[j] += A[i * NY + j] * tmp[i];
-		}
-	}
+  const auto NX = size;
+  const auto NY = size;
+
+  for(size_t i = 0; i < NX; i++) {
+    for(size_t j = 0; j < NY; j++) {
+      tmp[i] += A[i * NY + j] * x[j];
+    }
+
+    for(size_t j = 0; j < NY; j++) {
+      y[j] += A[i * NY + j] * tmp[i];
+    }
+  }
 }
 
 class Polybench_Atax {
-  public:
-	Polybench_Atax(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+public:
+  Polybench_Atax(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		x.resize(size);
-		y.resize(size);
-		tmp.resize(size);
+  void setup() {
+    A.resize(size * size);
+    x.resize(size);
+    y.resize(size);
+    tmp.resize(size);
 
-		init_array(x.data(), A.data(), size);
+    init_array(x.data(), A.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>{size, size});
-		x_buffer.initialize(args.device_queue, x.data(), cl::sycl::range<1>{size});
-		y_buffer.initialize(args.device_queue, y.data(), cl::sycl::range<1>{size});
-		tmp_buffer.initialize(args.device_queue, tmp.data(), cl::sycl::range<1>{size});
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>{size, size});
+    x_buffer.initialize(args.device_queue, x.data(), sycl::range<1>{size});
+    y_buffer.initialize(args.device_queue, y.data(), sycl::range<1>{size});
+    tmp_buffer.initialize(args.device_queue, tmp.data(), sycl::range<1>{size});
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto x = x_buffer.get_access<access::mode::read>(cgh);
-			auto tmp = tmp_buffer.get_access<access::mode::read_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto x = x_buffer.get_access<access::mode::read>(cgh);
+      auto tmp = tmp_buffer.get_access<access::mode::read_write>(cgh);
 
-			cgh.parallel_for<Atax1>(tmp_buffer.get_range(), [=, size_ = size](item<1> item) {
-				const auto i = item[0];
+      cgh.parallel_for<Atax1>(tmp_buffer.get_range(), [=, size_ = size](item<1> item) {
+        const auto i = item[0];
 
-				for(size_t j = 0; j < size_; j++) {
-					tmp[item] += A[{i, j}] * x[j];
-				}
-			});
-		}));
+        for(size_t j = 0; j < size_; j++) {
+          tmp[item] += A[{i, j}] * x[j];
+        }
+      });
+    }));
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto y = y_buffer.get_access<access::mode::read_write>(cgh);
-			auto tmp = tmp_buffer.get_access<access::mode::read>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto y = y_buffer.get_access<access::mode::read_write>(cgh);
+      auto tmp = tmp_buffer.get_access<access::mode::read>(cgh);
 
-			cgh.parallel_for<Atax2>(y_buffer.get_range(), [=, size_ = size](item<1> item) {
-				const auto j = item[0];
+      cgh.parallel_for<Atax2>(y_buffer.get_range(), [=, size_ = size](item<1> item) {
+        const auto j = item[0];
 
-				for(size_t i = 0; i < size_; i++) {
-					y[item] += A[{i, j}] * tmp[i];
-				}
-			});
-		}));
-	}
+        for(size_t i = 0; i < size_; i++) {
+          y[item] += A[{i, j}] * tmp[i];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		init_array(x.data(), A.data(), size);
+    init_array(x.data(), A.data(), size);
 
-		std::vector<DATA_TYPE> y_cpu(size);
-		std::vector<DATA_TYPE> tmp_cpu(size);
+    std::vector<DATA_TYPE> y_cpu(size);
+    std::vector<DATA_TYPE> tmp_cpu(size);
 
-		atax_cpu(A.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size);
+    atax_cpu(A.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size);
 
-		auto y_acc = y_buffer.get_access<cl::sycl::access::mode::read>();
+    auto y_acc = y_buffer.get_host_access();
 
-		for(size_t i = 0; i < size; i++) {
-			const auto diff = percentDiff(y_cpu[i], y_acc[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-		}
+    for(size_t i = 0; i < size; i++) {
+      const auto diff = percentDiff(y_cpu[i], y_acc[i]);
+      if(diff > ERROR_THRESHOLD)
+        return false;
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Atax"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Atax"; }
 
-  private:
-	BenchmarkArgs args;
+private:
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> x;
-	std::vector<DATA_TYPE> y;
-	std::vector<DATA_TYPE> tmp;
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> x;
+  std::vector<DATA_TYPE> y;
+  std::vector<DATA_TYPE> tmp;
 
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> x_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> y_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> tmp_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> x_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> y_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> tmp_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Atax>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Atax>();
+  return 0;
 }
diff --git a/polybench/bicg.cpp b/polybench/bicg.cpp
index 0e569256..4655e602 100644
--- a/polybench/bicg.cpp
+++ b/polybench/bicg.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -18,130 +18,132 @@ class Bicg1;
 class Bicg2;
 
 void init_array(DATA_TYPE* A, DATA_TYPE* p, DATA_TYPE* r, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
+  const auto NX = size;
+  const auto NY = size;
 
-	for(size_t i = 0; i < NX; i++) {
-		r[i] = i * M_PI;
+  for(size_t i = 0; i < NX; i++) {
+    r[i] = i * M_PI;
 
-		for(size_t j = 0; j < NY; j++) {
-			A[i * NY + j] = ((DATA_TYPE)i * j) / NX;
-		}
-	}
+    for(size_t j = 0; j < NY; j++) {
+      A[i * NY + j] = ((DATA_TYPE)i * j) / NX;
+    }
+  }
 
-	for(size_t i = 0; i < NY; i++) {
-		p[i] = i * M_PI;
-	}
+  for(size_t i = 0; i < NY; i++) {
+    p[i] = i * M_PI;
+  }
 }
 
 void bicg_cpu(DATA_TYPE* A, DATA_TYPE* r, DATA_TYPE* s, DATA_TYPE* p, DATA_TYPE* q, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
-
-	for(size_t i = 0; i < NX; i++) {
-		for(size_t j = 0; j < NY; j++) {
-			s[j] += r[i] * A[i * NY + j];
-			q[i] += A[i * NY + j] * p[j];
-		}
-	}
+  const auto NX = size;
+  const auto NY = size;
+
+  for(size_t i = 0; i < NX; i++) {
+    for(size_t j = 0; j < NY; j++) {
+      s[j] += r[i] * A[i * NY + j];
+      q[i] += A[i * NY + j] * p[j];
+    }
+  }
 }
 
 class Polybench_Bicg {
-  public:
-	Polybench_Bicg(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		A.resize(size * size);
-		r.resize(size);
-		s.resize(size);
-		p.resize(size);
-		q.resize(size);
-
-		init_array(A.data(), p.data(), r.data(), size);
-
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		r_buffer.initialize(args.device_queue, r.data(), cl::sycl::range<1>(size));
-	  s_buffer.initialize(args.device_queue, s.data(), cl::sycl::range<1>(size));
-		p_buffer.initialize(args.device_queue, p.data(), cl::sycl::range<1>(size));
-		q_buffer.initialize(args.device_queue, q.data(), cl::sycl::range<1>(size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto r = r_buffer.get_access<access::mode::read>(cgh);
-			auto s = s_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Bicg1>(s_buffer.get_range(), [=, size_ = size](item<1> item) {
-				const auto j = item[0];
-
-				for(size_t i = 0; i < size_; i++) {
-					s[item] += A[{i, j}] * r[i];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto p = p_buffer.get_access<access::mode::read>(cgh);
-			auto q = q_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Bicg2>(q_buffer.get_range(), [=, size_ = size](item<1> item) {
-				const auto i = item[0];
-
-				for(size_t j = 0; j < size_; j++) {
-					q[item] += A[{i, j}] * p[j];
-				}
-			});
-		}));
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		// Trigger writebacks
-		s_buffer.reset();
-		q_buffer.reset();
-
-		std::vector<DATA_TYPE> s_cpu(size);
-		std::vector<DATA_TYPE> q_cpu(size);
-
-		bicg_cpu(A.data(), r.data(), s_cpu.data(), p.data(), q_cpu.data(), size);
-
-		for(size_t i = 0; i < size; i++) {
-			auto diff = percentDiff(s_cpu[i], s[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-
-			diff = percentDiff(q_cpu[i], q[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Bicg"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> r;
-	std::vector<DATA_TYPE> s;
-	std::vector<DATA_TYPE> p;
-	std::vector<DATA_TYPE> q;
-
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> r_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> s_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> p_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> q_buffer;
+public:
+  Polybench_Bicg(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    A.resize(size * size);
+    r.resize(size);
+    s.resize(size);
+    p.resize(size);
+    q.resize(size);
+
+    init_array(A.data(), p.data(), r.data(), size);
+
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    r_buffer.initialize(args.device_queue, r.data(), sycl::range<1>(size));
+    s_buffer.initialize(args.device_queue, s.data(), sycl::range<1>(size));
+    p_buffer.initialize(args.device_queue, p.data(), sycl::range<1>(size));
+    q_buffer.initialize(args.device_queue, q.data(), sycl::range<1>(size));
+  }
+
+  void run(std::vector<sycl::event>& events) { // appends one event per submitted kernel to events
+    using namespace sycl;
+    // Kernel Bicg1: one work-item per column j; accumulates s[j] += A[i][j] * r[i] over all rows i.
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto r = r_buffer.get_access<access::mode::read>(cgh);
+      auto s = s_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Bicg1>(s_buffer.get_range(), [=, size_ = size](item<1> item) {
+        const auto j = item[0]; // column handled by this work-item
+
+        for(size_t i = 0; i < size_; i++) {
+          s[item] += A[{i, j}] * r[i];
+        }
+      });
+    }));
+    // Kernel Bicg2: one work-item per row i; accumulates q[i] += A[i][j] * p[j] over all columns j.
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto p = p_buffer.get_access<access::mode::read>(cgh);
+      auto q = q_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Bicg2>(q_buffer.get_range(), [=, size_ = size](item<1> item) {
+        const auto i = item[0]; // row handled by this work-item
+
+        for(size_t j = 0; j < size_; j++) {
+          q[item] += A[{i, j}] * p[j];
+        }
+      });
+    }));
+  }
+
+  // Compares the host vectors s and q with a fresh host run of bicg_cpu.
+  // Returns false at the first index where percentDiff exceeds ERROR_THRESHOLD, true otherwise.
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    // Trigger writebacks before s and q are read below.
+    s_buffer.reset();
+    q_buffer.reset();
+
+    std::vector<DATA_TYPE> s_ref(size);
+    std::vector<DATA_TYPE> q_ref(size);
+
+    bicg_cpu(A.data(), r.data(), s_ref.data(), p.data(), q_ref.data(), size);
+
+    for(size_t idx = 0; idx < size; idx++) {
+      // Short-circuit evaluation: s is checked first, q only if s is within tolerance.
+      if(percentDiff(s_ref[idx], s[idx]) > ERROR_THRESHOLD
+          || percentDiff(q_ref[idx], q[idx]) > ERROR_THRESHOLD) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Bicg"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> r;
+  std::vector<DATA_TYPE> s;
+  std::vector<DATA_TYPE> p;
+  std::vector<DATA_TYPE> q;
+
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> r_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> s_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> p_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> q_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Bicg>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Bicg>();
+  return 0;
 }
diff --git a/polybench/common/polybenchUtilFuncts.h b/polybench/common/polybenchUtilFuncts.h
index 9cc0b0b7..f5390f82 100644
--- a/polybench/common/polybenchUtilFuncts.h
+++ b/polybench/common/polybenchUtilFuncts.h
@@ -14,34 +14,33 @@
 #define SMALL_FLOAT_VAL 0.00000001f
 
 double rtclock() {
-	struct timezone Tzp;
-	struct timeval Tp;
-	int stat;
-	stat = gettimeofday(&Tp, &Tzp);
-	if(stat != 0) printf("Error return from gettimeofday: %d", stat);
-	return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
+  struct timezone Tzp;
+  struct timeval Tp;
+  int stat;
+  stat = gettimeofday(&Tp, &Tzp);
+  if(stat != 0)
+    printf("Error return from gettimeofday: %d", stat);
+  return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
 }
 
 
 float absVal(float a) {
-	if(a < 0) {
-		return (a * -1);
-	} else {
-		return a;
-	}
+  if(a < 0) {
+    return (a * -1);
+  } else {
+    return a;
+  }
 }
 
 
 float percentDiff(double val1, double val2) {
-	if((absVal(val1) < 0.01) && (absVal(val2) < 0.01)) {
-		return 0.0f;
-	} else {
-		return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
-	}
+  if((absVal(val1) < 0.01) && (absVal(val2) < 0.01)) {
+    return 0.0f;
+  } else {
+    return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
+  }
 }
 
-static bool shouldDoCpu(void) {
-	return getenv("SYCL_BENCH_SKIP_CPU") == NULL;
-}
+static bool shouldDoCpu(void) { return getenv("SYCL_BENCH_SKIP_CPU") == NULL; }
 
 #endif // POLYBENCH_UTIL_FUNCTS_H
diff --git a/polybench/common/syclUtilFuncts.h b/polybench/common/syclUtilFuncts.h
index 77ab6fc8..bff71444 100644
--- a/polybench/common/syclUtilFuncts.h
+++ b/polybench/common/syclUtilFuncts.h
@@ -1,18 +1,18 @@
 #ifndef SYCL_UTIL_FUNCTS_H
 #define SYCL_UTIL_FUNCTS_H
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 template <typename T, int Dims>
-void initDeviceBuffer(cl::sycl::queue& queue, cl::sycl::buffer<T, Dims>& buffer, T* data) {
-	using namespace cl::sycl;
+void initDeviceBuffer(sycl::queue& queue, sycl::buffer<T, Dims>& buffer, T* data) {
+  using namespace sycl;
 
-	queue.submit([&](handler& cgh) {
-		auto accessor = buffer.template get_access<access::mode::discard_write>(cgh);
-		cgh.copy(data, accessor);
-	});
+  queue.submit([&](handler& cgh) {
+    auto accessor = buffer.template get_access<access::mode::discard_write>(cgh);
+    cgh.copy(data, accessor);
+  });
 
-	queue.wait();
+  queue.wait();
 }
 
 #endif
diff --git a/polybench/correlation.cpp b/polybench/correlation.cpp
index 7e098417..ab552b91 100644
--- a/polybench/correlation.cpp
+++ b/polybench/correlation.cpp
@@ -4,7 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -23,210 +23,219 @@ class CorrelationCorr;
 class Correlation5;
 
 void init_arrays(DATA_TYPE* data, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	for(size_t i = 0; i <= M; i++) {
-		for(size_t j = 0; j <= N; j++) {
-			data[i * N + j] = ((DATA_TYPE)i * j) / (M + 1);
-		}
-	}
+  const auto M = size;
+  const auto N = size;
+  // data is an (M + 1) x (N + 1) row-major matrix, so consecutive rows are N + 1 elements apart.
+  for(size_t i = 0; i <= M; i++) {
+    for(size_t j = 0; j <= N; j++) {
+      data[i * (N + 1) + j] = ((DATA_TYPE)i * j) / (M + 1);
+    }
+  }
 }
 
 void correlation(DATA_TYPE* data, DATA_TYPE* mean, DATA_TYPE* stddev, DATA_TYPE* symmat, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	// Determine mean of column vectors of input data matrix
-	for(size_t j = 1; j <= M; j++) {
-		mean[j] = 0.0;
-
-		for(size_t i = 1; i <= N; i++) {
-			mean[j] += data[i * (M + 1) + j];
-		}
-
-		mean[j] /= (DATA_TYPE)FLOAT_N;
-	}
-
-	// Determine standard deviations of column vectors of data matrix.
-	for(size_t j = 1; j <= M; j++) {
-		stddev[j] = 0.0;
-
-		for(size_t i = 1; i <= N; i++) {
-			stddev[j] += (data[i * (M + 1) + j] - mean[j]) * (data[i * (M + 1) + j] - mean[j]);
-		}
-
-		stddev[j] /= FLOAT_N;
-		stddev[j] = sqrt_of_array_cell(stddev, j);
-		stddev[j] = stddev[j] <= EPS ? 1.0 : stddev[j];
-	}
-
-	// Center and reduce the column vectors.
-	for(size_t i = 1; i <= N; i++) {
-		for(size_t j = 1; j <= M; j++) {
-			data[i * (M + 1) + j] -= mean[j];
-			data[i * (M + 1) + j] /= sqrt(FLOAT_N);
-			data[i * (M + 1) + j] /= stddev[j];
-		}
-	}
-
-	// Calculate the m * m correlation matrix.
-	for(size_t j1 = 1; j1 <= M - 1; j1++) {
-		symmat[j1 * (M + 1) + j1] = 1.0;
-
-		for(size_t j2 = j1 + 1; j2 <= M; j2++) {
-			symmat[j1 * (M + 1) + j2] = 0.0;
-
-			for(size_t i = 1; i <= N; i++) {
-				symmat[j1 * (M + 1) + j2] += (data[i * (M + 1) + j1] * data[i * (M + 1) + j2]);
-			}
-
-			symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2];
-		}
-	}
-
-	symmat[M * (M + 1) + M] = 1.0;
+  const auto M = size;
+  const auto N = size;
+
+  // Determine mean of column vectors of input data matrix
+  for(size_t j = 1; j <= M; j++) {
+    mean[j] = 0.0;
+
+    for(size_t i = 1; i <= N; i++) {
+      mean[j] += data[i * (M + 1) + j];
+    }
+
+    mean[j] /= (DATA_TYPE)FLOAT_N;
+  }
+
+  // Determine standard deviations of column vectors of data matrix.
+  for(size_t j = 1; j <= M; j++) {
+    stddev[j] = 0.0;
+
+    for(size_t i = 1; i <= N; i++) {
+      stddev[j] += (data[i * (M + 1) + j] - mean[j]) * (data[i * (M + 1) + j] - mean[j]);
+    }
+
+    stddev[j] /= FLOAT_N;
+    stddev[j] = sqrt_of_array_cell(stddev, j);
+    stddev[j] = stddev[j] <= EPS ? 1.0 : stddev[j];
+  }
+
+  // Center and reduce the column vectors.
+  for(size_t i = 1; i <= N; i++) {
+    for(size_t j = 1; j <= M; j++) {
+      data[i * (M + 1) + j] -= mean[j];
+      data[i * (M + 1) + j] /= sqrt(FLOAT_N);
+      data[i * (M + 1) + j] /= stddev[j];
+    }
+  }
+
+  // Calculate the m * m correlation matrix.
+  for(size_t j1 = 1; j1 <= M - 1; j1++) {
+    symmat[j1 * (M + 1) + j1] = 1.0;
+
+    for(size_t j2 = j1 + 1; j2 <= M; j2++) {
+      symmat[j1 * (M + 1) + j2] = 0.0;
+
+      for(size_t i = 1; i <= N; i++) {
+        symmat[j1 * (M + 1) + j2] += (data[i * (M + 1) + j1] * data[i * (M + 1) + j2]);
+      }
+
+      symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2];
+    }
+  }
+
+  symmat[M * (M + 1) + M] = 1.0;
 }
 
 class Polybench_Correlation {
-  public:
-	Polybench_Correlation(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		data.resize((size + 1) * (size + 1));
-		mean.resize(size + 1);
-		stddev.resize(size + 1);
-		symmat.resize((size + 1) * (size + 1));
-
-		init_arrays(data.data(), size);
-
-		data_buffer.initialize(args.device_queue, data.data(), cl::sycl::range<2>(size + 1, size + 1));
-		mean_buffer.initialize(args.device_queue, mean.data(), cl::sycl::range<1>(size + 1));
-		stddev_buffer.initialize(args.device_queue, stddev.data(), cl::sycl::range<1>(size + 1));
-		symmat_buffer.initialize(args.device_queue, symmat.data(), cl::sycl::range<2>(size + 1, size + 1));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read>(cgh);
-			auto mean = mean_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<CorrelationMean>(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) {
-				const auto j = item[0];
-
-				for(size_t i = 1; i <= N_; i++) {
-					mean[item] += data[{i, j}];
-				}
-				mean[item] /= ((DATA_TYPE)FLOAT_N);
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read>(cgh);
-			auto mean = mean_buffer.get_access<access::mode::read>(cgh);
-			auto stddev = stddev_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<CorrelationStd>(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) {
-				const auto j = item[0];
-
-				for(size_t i = 1; i <= N_; i++) {
-					stddev[item] += (data[{i, j}] - mean[item]) * (data[{i, j}] - mean[item]);
-				}
-
-				stddev[item] /= FLOAT_N;
-				stddev[item] = cl::sycl::sqrt(stddev[item]);
-				stddev[item] = stddev[item] <= EPS ? 1.0 : stddev[item];
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read_write>(cgh);
-			auto mean = mean_buffer.get_access<access::mode::read>(cgh);
-			auto stddev = stddev_buffer.get_access<access::mode::read>(cgh);
-
-			cgh.parallel_for<CorrelationReduce>(range<2>(size, size), id<2>(1, 1), [=](item<2> item) {
-				const auto j = item[1];
-
-				data[item] -= mean[j];
-				data[item] /= cl::sycl::sqrt(FLOAT_N);
-				data[item] /= stddev[j];
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read>(cgh);
-			auto symmat = symmat_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<CorrelationCorr>(range<1>(size), id<1>(1), [=, M_ = size, N_ = size](item<1> item) {
-				// if(item[0] >= M_ - 1) return;
-
-				const auto j1 = item[0];
-
-				symmat[{j1, j1}] = 1.0;
-
-				for(size_t j2 = j1 + 1; j2 <= M_; j2++) {
-					symmat[{j1, j2}] = 0.0;
-
-					for(size_t i = 1; i <= N_; i++) {
-						symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}];
-					}
-
-					symmat[{j2, j1}] = symmat[{j1, j2}];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto symmat = symmat_buffer.get_access<access::mode::discard_write>(cgh);
-			cgh.parallel_for<Correlation5>(range<2>(1, 1), id<2>(size, size), [=](item<2> item) { symmat[item] = 1.0; });
-		}));
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		std::vector<DATA_TYPE> data_cpu((size + 1) * (size + 1));
-		std::vector<DATA_TYPE> mean_cpu(size + 1);
-		std::vector<DATA_TYPE> stddev_cpu(size + 1);
-		std::vector<DATA_TYPE> symmat_cpu((size + 1) * (size + 1));
-
-		// Trigger writeback
-		symmat_buffer.reset();
-
-		init_arrays(data_cpu.data(), size);
-		correlation(data_cpu.data(), mean_cpu.data(), stddev_cpu.data(), symmat_cpu.data(), size);
-
-		for(size_t i = 1; i < size + 1; i++) {
-			for(size_t j = 1; j < size + 1; j++) {
-				const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Correlation"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> data;
-	std::vector<DATA_TYPE> mean;
-	std::vector<DATA_TYPE> stddev;
-	std::vector<DATA_TYPE> symmat;
-
-	PrefetchedBuffer<DATA_TYPE, 2> data_buffer;
+public:
+  Polybench_Correlation(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    data.resize((size + 1) * (size + 1));
+    mean.resize(size + 1);
+    stddev.resize(size + 1);
+    symmat.resize((size + 1) * (size + 1));
+
+    init_arrays(data.data(), size);
+
+    data_buffer.initialize(args.device_queue, data.data(), sycl::range<2>(size + 1, size + 1));
+    mean_buffer.initialize(args.device_queue, mean.data(), sycl::range<1>(size + 1));
+    stddev_buffer.initialize(args.device_queue, stddev.data(), sycl::range<1>(size + 1));
+    symmat_buffer.initialize(args.device_queue, symmat.data(), sycl::range<2>(size + 1, size + 1));
+  }
+
+  void run(std::vector<sycl::event>& events) { // appends one event per submitted kernel to events
+    using namespace sycl;
+    // Kernel 1: per column j in 1..size, sum data[i][j] over rows 1..size into mean[j], then divide by FLOAT_N.
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read>(cgh);
+      auto mean = mean_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<CorrelationMean>(range<1>(size), [=, N_ = size](id<1> gid) {
+        const id<1> offset(1); // shifts the 0-based work-item id onto indices 1..size
+        const auto j = gid[0] + offset[0];
+
+        for(size_t i = 1; i <= N_; i++) {
+          mean[gid + offset] += data[{i, j}];
+        }
+        mean[gid + offset] /= ((DATA_TYPE)FLOAT_N);
+      });
+    }));
+    // Kernel 2: per column j, sum squared deviations from mean[j], divide by FLOAT_N, take the square root.
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read>(cgh);
+      auto mean = mean_buffer.get_access<access::mode::read>(cgh);
+      auto stddev = stddev_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<CorrelationStd>(range<1>(size), [=, N_ = size](id<1> gid) {
+        const id<1> offset(1);
+        const auto adj_id = gid + offset;
+        const auto j = gid[0] + offset[0];
+
+        for(size_t i = 1; i <= N_; i++) {
+          stddev[adj_id] += (data[{i, j}] - mean[adj_id]) * (data[{i, j}] - mean[adj_id]);
+        }
+
+        stddev[adj_id] /= FLOAT_N;
+        stddev[adj_id] = sycl::sqrt(stddev[adj_id]);
+        stddev[adj_id] = stddev[adj_id] <= EPS ? 1.0 : stddev[adj_id]; // values <= EPS are replaced by 1.0
+      });
+    }));
+    // Kernel 3: for every (i, j) in 1..size, subtract mean[j], then divide by sqrt(FLOAT_N) and by stddev[j].
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read_write>(cgh);
+      auto mean = mean_buffer.get_access<access::mode::read>(cgh);
+      auto stddev = stddev_buffer.get_access<access::mode::read>(cgh);
+
+      cgh.parallel_for<CorrelationReduce>(range<2>(size, size), [=](id<2> gid) {
+        const id<2> offset(1, 1);
+        const auto adj_id = gid + offset;
+        const auto j = gid[1] + offset[1];
+
+        data[adj_id] -= mean[j];
+        data[adj_id] /= sycl::sqrt(FLOAT_N);
+        data[adj_id] /= stddev[j];
+      });
+    }));
+    // Kernel 4: one work-item per j1 in 1..size fills symmat[j1][j2] for j2 > j1 and mirrors it to symmat[j2][j1].
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read>(cgh);
+      auto symmat = symmat_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<CorrelationCorr>(range<1>(size), [=, M_ = size, N_ = size](id<1> gid) {
+        // No early exit is needed for j1 == M_: the j2 loop below is empty then, only the diagonal is written.
+        const id<1> offset(1);
+        const auto j1 = gid[0] + offset[0];
+
+        symmat[{j1, j1}] = 1.0;
+
+        for(size_t j2 = j1 + 1; j2 <= M_; j2++) {
+          symmat[{j1, j2}] = 0.0;
+
+          for(size_t i = 1; i <= N_; i++) {
+            symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}];
+          }
+
+          symmat[{j2, j1}] = symmat[{j1, j2}];
+        }
+      });
+    }));
+    // Kernel 5: a single work-item writes 1.0 to symmat[size][size].
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto symmat = symmat_buffer.get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<Correlation5>(range<2>(1, 1), [=, M_ = size](id<2> gid) {
+        const id<2> offset(M_, M_);
+        symmat[gid + offset] = 1.0;
+      });
+    }));
+  }
+
+  // Runs the host reference (init_arrays + correlation) and compares symmat for row/column indices 1..size.
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+    const size_t stride = size + 1; // row length of the (size + 1) x (size + 1) matrices
+
+    std::vector<DATA_TYPE> data_ref(stride * stride);
+    std::vector<DATA_TYPE> mean_ref(stride);
+    std::vector<DATA_TYPE> stddev_ref(stride);
+    std::vector<DATA_TYPE> symmat_ref(stride * stride);
+
+    // Trigger writeback
+    symmat_buffer.reset();
+    init_arrays(data_ref.data(), size);
+    correlation(data_ref.data(), mean_ref.data(), stddev_ref.data(), symmat_ref.data(), size);
+
+    for(size_t row = 1; row < stride; row++) {
+      for(size_t col = 1; col < stride; col++) {
+        const size_t flat = row * stride + col;
+        if(percentDiff(symmat_ref[flat], symmat[flat]) > ERROR_THRESHOLD)
+          return false;
+      }
+    }
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Correlation"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> data;
+  std::vector<DATA_TYPE> mean;
+  std::vector<DATA_TYPE> stddev;
+  std::vector<DATA_TYPE> symmat;
+
+  PrefetchedBuffer<DATA_TYPE, 2> data_buffer;
   PrefetchedBuffer<DATA_TYPE, 1> mean_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> stddev_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> symmat_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> stddev_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> symmat_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Correlation>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Correlation>(); // runs the Polybench_Correlation benchmark class through BenchmarkApp
+  return 0;
 }
diff --git a/polybench/covariance.cpp b/polybench/covariance.cpp
index c15c37b4..88b22f06 100644
--- a/polybench/covariance.cpp
+++ b/polybench/covariance.cpp
@@ -4,7 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -18,155 +18,159 @@ class CovarianceCovar;
 constexpr DATA_TYPE float_n = 3214212.01;
 
 void init_arrays(DATA_TYPE* data, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	for(size_t i = 0; i < M; i++) {
-		for(size_t j = 0; j < N; j++) {
-			data[i * (N + 1) + j] = ((DATA_TYPE)i * j) / M;
-		}
-	}
+  const auto M = size;
+  const auto N = size;
+
+  for(size_t i = 0; i < M; i++) {
+    for(size_t j = 0; j < N; j++) {
+      data[i * (N + 1) + j] = ((DATA_TYPE)i * j) / M;
+    }
+  }
 }
 
 void covariance(DATA_TYPE* data, DATA_TYPE* symmat, DATA_TYPE* mean, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	// Determine mean of column vectors of input data matrix
-	for(size_t j = 1; j <= M; j++) {
-		mean[j] = 0.0;
-		for(size_t i = 1; i <= N; i++) {
-			mean[j] += data[i * (M + 1) + j];
-		}
-		mean[j] /= float_n;
-	}
-
-	// Center the column vectors.
-	for(size_t i = 1; i <= N; i++) {
-		for(size_t j = 1; j <= M; j++) {
-			data[i * (M + 1) + j] -= mean[j];
-		}
-	}
-
-	// Calculate the m * m covariance matrix.
-	for(size_t j1 = 1; j1 <= M; j1++) {
-		for(size_t j2 = j1; j2 <= M; j2++) {
-			symmat[j1 * (M + 1) + j2] = 0.0;
-			for(size_t i = 1; i <= N; i++) {
-				symmat[j1 * (M + 1) + j2] += data[i * (M + 1) + j1] * data[i * (M + 1) + j2];
-			}
-			symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2];
-		}
-	}
+  const auto M = size;
+  const auto N = size;
+
+  // Determine mean of column vectors of input data matrix
+  for(size_t j = 1; j <= M; j++) {
+    mean[j] = 0.0;
+    for(size_t i = 1; i <= N; i++) {
+      mean[j] += data[i * (M + 1) + j];
+    }
+    mean[j] /= float_n;
+  }
+
+  // Center the column vectors.
+  for(size_t i = 1; i <= N; i++) {
+    for(size_t j = 1; j <= M; j++) {
+      data[i * (M + 1) + j] -= mean[j];
+    }
+  }
+
+  // Calculate the m * m covariance matrix.
+  for(size_t j1 = 1; j1 <= M; j1++) {
+    for(size_t j2 = j1; j2 <= M; j2++) {
+      symmat[j1 * (M + 1) + j2] = 0.0;
+      for(size_t i = 1; i <= N; i++) {
+        symmat[j1 * (M + 1) + j2] += data[i * (M + 1) + j1] * data[i * (M + 1) + j2];
+      }
+      symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2];
+    }
+  }
 }
 
 class Polybench_Covariance {
 public:
-	Polybench_Covariance(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+  Polybench_Covariance(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		data.resize((size + 1) * (size + 1));
-		symmat.resize((size + 1) * (size + 1));
-		mean.resize(size + 1);
+  void setup() {
+    data.resize((size + 1) * (size + 1));
+    symmat.resize((size + 1) * (size + 1));
+    mean.resize(size + 1);
 
-		init_arrays(data.data(), size);
+    init_arrays(data.data(), size);
 
-		data_buffer.initialize(args.device_queue, data.data(), cl::sycl::range<2>(size + 1, size + 1));
-		symmat_buffer.initialize(args.device_queue, symmat.data(), cl::sycl::range<2>(size + 1, size + 1));
-		mean_buffer.initialize(args.device_queue, mean.data(), cl::sycl::range<1>(size + 1));
+    data_buffer.initialize(args.device_queue, data.data(), sycl::range<2>(size + 1, size + 1));
+    symmat_buffer.initialize(args.device_queue, symmat.data(), sycl::range<2>(size + 1, size + 1));
+    mean_buffer.initialize(args.device_queue, mean.data(), sycl::range<1>(size + 1));
   }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read>(cgh);
-			auto mean = mean_buffer.get_access<access::mode::discard_write>(cgh);
-
-			cgh.parallel_for<CovarianceMean>(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) {
-				const auto j = item[0];
-
-				mean[item] = 0;
-				for(size_t i = 1; i <= N_; i++) {
-					mean[item] += data[{i, j}];
-				}
-				mean[item] /= float_n;
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto mean = mean_buffer.get_access<access::mode::read>(cgh);
-			auto data = data_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<CovarianceReduce>(range<2>(size, size), id<2>(1, 1), [=](item<2> item) {
-				const auto j = item[1];
-				data[item] -= mean[j];
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto data = data_buffer.get_access<access::mode::read>(cgh);
-			auto symmat = symmat_buffer.get_access<access::mode::discard_write>(cgh);
-			auto symmat2 = symmat_buffer.get_access<access::mode::discard_write>(cgh);
-
-			cgh.parallel_for<CovarianceCovar>(range<1>(size), id<1>(1), [=, M_ = size, N_ = size](item<1> item) {
-				const auto j1 = item[0];
-
-				symmat[{j1, j1}] = 1.0;
-
-				for(size_t j2 = j1; j2 <= M_; j2++) {
-					symmat[{j1, j2}] = 0.0;
-					for(size_t i = 1; i <= N_; i++) {
-						symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}];
-					}
-
-					symmat2[{j2, j1}] = symmat[{j1, j2}];
-				}
-			});
-		}));
-	}
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
+    // Submits three kernels in order and appends the event of each submission to `events`.
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read>(cgh);
+      auto mean = mean_buffer.get_access<access::mode::discard_write>(cgh);
+      // Kernel 1: mean[j] = (sum of data[i][j] for i = 1..size) / float_n, for j = 1..size.
+      cgh.parallel_for<CovarianceMean>(range<1>(size), [=, N_ = size](id<1> gid) {
+        const id<1> offset(1);
+        const auto j = gid[0] + offset[0];
+
+        mean[gid + offset] = 0;
+        for(size_t i = 1; i <= N_; i++) {
+          mean[gid + offset] += data[{i, j}];
+        }
+        mean[gid + offset] /= float_n;
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto mean = mean_buffer.get_access<access::mode::read>(cgh);
+      auto data = data_buffer.get_access<access::mode::read_write>(cgh);
+      // Kernel 2: subtract mean[j] from every data[i][j] with i, j in 1..size.
+      cgh.parallel_for<CovarianceReduce>(range<2>(size, size), [=](id<2> gid) {
+        const id<2> offset(1, 1);
+        const auto j = gid[1] + offset[1];
+        data[gid + offset] -= mean[j];
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto data = data_buffer.get_access<access::mode::read>(cgh);
+      auto symmat = symmat_buffer.get_access<access::mode::discard_write>(cgh);
+      auto symmat2 = symmat_buffer.get_access<access::mode::discard_write>(cgh);
+      // Kernel 3: one work-item per j1 fills symmat[j1][j2] for j2 >= j1 and mirrors it through symmat2.
+      cgh.parallel_for<CovarianceCovar>(range<1>(size), [=, M_ = size, N_ = size](id<1> gid) {
+        const id<1> offset(1);
+        const auto j1 = gid[0] + offset[0];
+
+        symmat[{j1, j1}] = 1.0;
+        // NOTE(review): the 1.0 stored above is overwritten when the loop below starts at j2 == j1.
+        for(size_t j2 = j1; j2 <= M_; j2++) {
+          symmat[{j1, j2}] = 0.0;
+          for(size_t i = 1; i <= N_; i++) {
+            symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}];
+          }
+
+          symmat2[{j2, j1}] = symmat[{j1, j2}];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		std::vector<DATA_TYPE> data_cpu((size + 1) * (size + 1));
-		std::vector<DATA_TYPE> symmat_cpu((size + 1) * (size + 1));
-		std::vector<DATA_TYPE> mean_cpu(size + 1);
+    std::vector<DATA_TYPE> data_cpu((size + 1) * (size + 1));
+    std::vector<DATA_TYPE> symmat_cpu((size + 1) * (size + 1));
+    std::vector<DATA_TYPE> mean_cpu(size + 1);
 
-		// Trigger writeback
-		symmat_buffer.reset();
+    // Trigger writeback
+    symmat_buffer.reset();
 
-		init_arrays(data_cpu.data(), size);
+    init_arrays(data_cpu.data(), size);
 
-		covariance(data_cpu.data(), symmat_cpu.data(), mean_cpu.data(), size);
+    covariance(data_cpu.data(), symmat_cpu.data(), mean_cpu.data(), size);
 
-		for(size_t i = 1; i < size + 1; i++) {
-			for(size_t j = 1; j < size + 1; j++) {
-				const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
+    for(size_t i = 1; i < size + 1; i++) {
+      for(size_t j = 1; j < size + 1; j++) {
+        const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Covariance"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Covariance"; }
 
 private:
-	BenchmarkArgs args;
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> data;
-	std::vector<DATA_TYPE> symmat;
-	std::vector<DATA_TYPE> mean;
+  const size_t size;
+  std::vector<DATA_TYPE> data;
+  std::vector<DATA_TYPE> symmat;
+  std::vector<DATA_TYPE> mean;
 
-	PrefetchedBuffer<DATA_TYPE, 2> data_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> symmat_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> mean_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> data_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> symmat_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> mean_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Covariance>();
-	return 0;
-}
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Covariance>();
+  return 0;
+}
\ No newline at end of file
diff --git a/polybench/fdtd2d.cpp b/polybench/fdtd2d.cpp
index d7f8444a..42d17953 100644
--- a/polybench/fdtd2d.cpp
+++ b/polybench/fdtd2d.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -17,188 +17,192 @@ class Fdtd2d3;
 constexpr auto TMAX = 500;
 
 void init_arrays(DATA_TYPE* fict, DATA_TYPE* ex, DATA_TYPE* ey, DATA_TYPE* hz, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
-
-	for(size_t i = 0; i < TMAX; i++) {
-		fict[i] = (DATA_TYPE)i;
-	}
-
-	for(size_t i = 0; i < NX; i++) {
-		for(size_t j = 0; j < NY; j++) {
-			ex[i * NY + j] = ((DATA_TYPE)i * (j + 1) + 1) / NX;
-			ey[i * NY + j] = ((DATA_TYPE)(i - 1) * (j + 2) + 2) / NX;
-			hz[i * NY + j] = ((DATA_TYPE)(i - 9) * (j + 4) + 3) / NX;
-		}
-	}
+  const auto NX = size;
+  const auto NY = size;
+
+  for(size_t i = 0; i < TMAX; i++) {
+    fict[i] = (DATA_TYPE)i;
+  }
+
+  for(size_t i = 0; i < NX; i++) {
+    for(size_t j = 0; j < NY; j++) {
+      ex[i * NY + j] = ((DATA_TYPE)i * (j + 1) + 1) / NX;
+      ey[i * NY + j] = ((DATA_TYPE)(i - 1) * (j + 2) + 2) / NX;
+      hz[i * NY + j] = ((DATA_TYPE)(i - 9) * (j + 4) + 3) / NX;
+    }
+  }
 }
 
 void runFdtd(DATA_TYPE* fict, DATA_TYPE* ex, DATA_TYPE* ey, DATA_TYPE* hz, size_t size) {
-	const auto NX = size;
-	const auto NY = size;
-
-	for(size_t t = 0; t < TMAX; t++) {
-		for(size_t j = 0; j < NY; j++) {
-			ey[0 * NY + j] = fict[t];
-		}
-
-		for(size_t i = 1; i < NX; i++) {
-			for(size_t j = 0; j < NY; j++) {
-				ey[i * NY + j] = ey[i * NY + j] - 0.5 * (hz[i * NY + j] - hz[(i - 1) * NY + j]);
-			}
-		}
-
-		for(size_t i = 0; i < NX; i++) {
-			for(size_t j = 1; j < NY; j++) {
-				ex[i * (NY + 1) + j] = ex[i * (NY + 1) + j] - 0.5 * (hz[i * NY + j] - hz[i * NY + (j - 1)]);
-			}
-		}
-
-		for(size_t i = 0; i < NX; i++) {
-			for(size_t j = 0; j < NY; j++) {
-				hz[i * NY + j] = hz[i * NY + j] - 0.7 * (ex[i * (NY + 1) + (j + 1)] - ex[i * (NY + 1) + j] + ey[(i + 1) * NY + j] - ey[i * NY + j]);
-			}
-		}
-	}
+  const auto NX = size;
+  const auto NY = size;
+
+  for(size_t t = 0; t < TMAX; t++) {
+    for(size_t j = 0; j < NY; j++) {
+      ey[0 * NY + j] = fict[t];
+    }
+
+    for(size_t i = 1; i < NX; i++) {
+      for(size_t j = 0; j < NY; j++) {
+        ey[i * NY + j] = ey[i * NY + j] - 0.5 * (hz[i * NY + j] - hz[(i - 1) * NY + j]);
+      }
+    }
+
+    for(size_t i = 0; i < NX; i++) {
+      for(size_t j = 1; j < NY; j++) {
+        ex[i * (NY + 1) + j] = ex[i * (NY + 1) + j] - 0.5 * (hz[i * NY + j] - hz[i * NY + (j - 1)]);
+      }
+    }
+
+    for(size_t i = 0; i < NX; i++) {
+      for(size_t j = 0; j < NY; j++) {
+        hz[i * NY + j] = hz[i * NY + j] - 0.7 * (ex[i * (NY + 1) + (j + 1)] - ex[i * (NY + 1) + j] +
+                                                    ey[(i + 1) * NY + j] - ey[i * NY + j]);
+      }
+    }
+  }
 }
 
 class Polybench_Fdtd2d {
-  public:
-	Polybench_Fdtd2d(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		fict.resize(TMAX);
-		ex.resize(size * (size + 1));
-		ey.resize((size + 1) * size);
-		hz.resize(size * size);
-
-		init_arrays(fict.data(), ex.data(), ey.data(), hz.data(), size);
-
-		fict_buffer.initialize(args.device_queue, fict.data(), cl::sycl::range<1>(TMAX));
-		ex_buffer.initialize(args.device_queue, ex.data(), cl::sycl::range<2>(size, size + 1));
-		ey_buffer.initialize(args.device_queue, ey.data(), cl::sycl::range<2>(size + 1, size));
-		hz_buffer.initialize(args.device_queue, hz.data(), cl::sycl::range<2>(size, size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		for(size_t t = 0; t < TMAX; t++) {
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto fict = fict_buffer.get_access<access::mode::read>(cgh);
-				auto ey = ey_buffer.get_access<access::mode::read_write>(cgh);
-				auto hz = hz_buffer.get_access<access::mode::read>(cgh);
-
-				cgh.parallel_for<Fdtd2d1>(range<2>(size, size), [=](item<2> item) {
-					const auto i = item[0];
-					const auto j = item[1];
-
-					if(i == 0) {
-						ey[item] = fict[t];
-					} else {
-						ey[item] = ey[item] - 0.5 * (hz[item] - hz[{(i - 1), j}]);
-					}
-				});
-			}));
-
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto ex = ex_buffer.get_access<access::mode::read_write>(cgh);
-				auto hz = hz_buffer.get_access<access::mode::read>(cgh);
-
-				cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
-					const auto i = item[0];
-					const auto j = item[1];
-
-					if(j > 0) ex[item] = ex[item] - 0.5 * (hz[item] - hz[{i, (j - 1)}]);
-				});
-			}));
-
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto ex = ex_buffer.get_access<access::mode::read>(cgh);
-				auto ey = ey_buffer.get_access<access::mode::read>(cgh);
-				auto hz = hz_buffer.get_access<access::mode::read_write>(cgh);
-
-				cgh.parallel_for<Fdtd2d3>(hz_buffer.get_range(), [=](item<2> item) {
-					const auto i = item[0];
-					const auto j = item[1];
-
-					hz[item] = hz[item] - 0.7 * (ex[{i, (j + 1)}] - ex[item] + ey[{(i + 1), j}] - ey[item]);
-				});
-			}));
-		}
-	}
-
-	bool verify(VerificationSetting&) {
-		// Yes, this is threshold is used by polybench/CUDA/fdtd2d. Numbers in
-		// this benchmark can get pretty large and regular floats don't provide
-		// enough precision. This verification may fail on some problem sizes.
-		constexpr auto ERROR_THRESHOLD = 10.05;
-
-		std::vector<DATA_TYPE> fict_cpu(TMAX);
-		std::vector<DATA_TYPE> ex_cpu(size * (size + 1));
-		std::vector<DATA_TYPE> ey_cpu((size + 1) * size);
-		std::vector<DATA_TYPE> hz_cpu(size * size);
-
-		// Trigger writebacks
-		hz_buffer.reset();
-
-		init_arrays(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size);
-
-		runFdtd(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size);
-
-		// for(size_t i = 0; i < size; i++) {
-		// 	for(size_t j = 0; j < size; j++) {
-		// 		const auto diff = percentDiff(ex_cpu[i * size + j], ex[i * size + j]);
-		// 		if(diff > ERROR_THRESHOLD) {
-		// 			printf("%ld %ld: %f %f %f\n", i, j, ex_cpu[i * size + j], ex[i * size + j], diff);
-		// 			return false;
-		// 		}
-		// 	}
-		// }
-
-		// for(size_t i = 0; i < size; i++) {
-		// 	for(size_t j = 0; j < size; j++) {
-		// 		const auto diff = percentDiff(ey_cpu[i * size + j], ey[i * size + j]);
-		// 		if(diff > ERROR_THRESHOLD) {
-		// 			printf("%ld %ld: %f %f %f\n", i, j, ey_cpu[i * size + j], ey[i * size + j], diff);
-		// 			return false;
-		// 		}
-		// 	}
-		// }
-
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(hz_cpu[i * size + j], hz[i * size + j]);
-				if(diff > ERROR_THRESHOLD) {
-					printf("%ld %ld: %f %f %f\n", i, j, hz_cpu[i * size + j], hz[i * size + j], diff);
-					return false;
-				}
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Fdtd2d"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> fict;
-	std::vector<DATA_TYPE> ex;
-	std::vector<DATA_TYPE> ey;
-	std::vector<DATA_TYPE> hz;
-
-	PrefetchedBuffer<DATA_TYPE, 1> fict_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> ex_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> ey_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> hz_buffer;
+public:
+  Polybench_Fdtd2d(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+
+  void setup() {
+    fict.resize(TMAX);
+    ex.resize(size * (size + 1));
+    ey.resize((size + 1) * size);
+    hz.resize(size * size);
+    // Fill the host arrays; init_arrays writes TMAX entries of fict and size x size entries of ex, ey and hz.
+    init_arrays(fict.data(), ex.data(), ey.data(), hz.data(), size);
+    // Initialize the buffers from the host arrays; ex has one extra column and ey one extra row.
+    fict_buffer.initialize(args.device_queue, fict.data(), sycl::range<1>(TMAX));
+    ex_buffer.initialize(args.device_queue, ex.data(), sycl::range<2>(size, size + 1));
+    ey_buffer.initialize(args.device_queue, ey.data(), sycl::range<2>(size + 1, size));
+    hz_buffer.initialize(args.device_queue, hz.data(), sycl::range<2>(size, size));
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
+    // Runs TMAX time steps; each step submits three kernels and records one event per submission.
+    for(size_t t = 0; t < TMAX; t++) {
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto fict = fict_buffer.get_access<access::mode::read>(cgh);
+        auto ey = ey_buffer.get_access<access::mode::read_write>(cgh);
+        auto hz = hz_buffer.get_access<access::mode::read>(cgh);
+        // Kernel 1 (ey): row 0 is set to fict[t]; other rows subtract 0.5 * (hz[i][j] - hz[i-1][j]).
+        cgh.parallel_for<Fdtd2d1>(range<2>(size, size), [=](item<2> item) {
+          const auto i = item[0];
+          const auto j = item[1];
+
+          if(i == 0) {
+            ey[item] = fict[t];
+          } else {
+            ey[item] = ey[item] - 0.5 * (hz[item] - hz[{(i - 1), j}]);
+          }
+        });
+      }));
+
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto ex = ex_buffer.get_access<access::mode::read_write>(cgh);
+        auto hz = hz_buffer.get_access<access::mode::read>(cgh);
+        // Kernel 2 (ex): for j > 0, subtract 0.5 * (hz[i][j] - hz[i][j-1]); column 0 is left unchanged.
+        cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
+          const auto i = item[0];
+          const auto j = item[1];
+
+          if(j > 0)
+            ex[item] = ex[item] - 0.5 * (hz[item] - hz[{i, (j - 1)}]);
+        });
+      }));
+
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto ex = ex_buffer.get_access<access::mode::read>(cgh);
+        auto ey = ey_buffer.get_access<access::mode::read>(cgh);
+        auto hz = hz_buffer.get_access<access::mode::read_write>(cgh);
+        // Kernel 3 (hz): subtract 0.7 * (ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]).
+        cgh.parallel_for<Fdtd2d3>(hz_buffer.get_range(), [=](item<2> item) {
+          const auto i = item[0];
+          const auto j = item[1];
+
+          hz[item] = hz[item] - 0.7 * (ex[{i, (j + 1)}] - ex[item] + ey[{(i + 1), j}] - ey[item]);
+        });
+      }));
+    }
+  }
+
+  bool verify(VerificationSetting&) {
+    // Yes, this threshold is used by polybench/CUDA/fdtd2d. Numbers in
+    // this benchmark can get pretty large and regular floats don't provide
+    // enough precision. This verification may fail on some problem sizes.
+    constexpr auto ERROR_THRESHOLD = 10.05;
+    // Recompute the result with runFdtd() on separate *_cpu arrays and compare hz against it element by element.
+    std::vector<DATA_TYPE> fict_cpu(TMAX);
+    std::vector<DATA_TYPE> ex_cpu(size * (size + 1));
+    std::vector<DATA_TYPE> ey_cpu((size + 1) * size);
+    std::vector<DATA_TYPE> hz_cpu(size * size);
+
+    // Trigger writebacks
+    hz_buffer.reset();
+
+    init_arrays(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size);
+
+    runFdtd(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size);
+
+    // for(size_t i = 0; i < size; i++) {
+    // 	for(size_t j = 0; j < size; j++) {
+    // 		const auto diff = percentDiff(ex_cpu[i * size + j], ex[i * size + j]);
+    // 		if(diff > ERROR_THRESHOLD) {
+    // 			printf("%zu %zu: %f %f %f\n", i, j, ex_cpu[i * size + j], ex[i * size + j], diff);
+    // 			return false;
+    // 		}
+    // 	}
+    // }
+
+    // for(size_t i = 0; i < size; i++) {
+    // 	for(size_t j = 0; j < size; j++) {
+    // 		const auto diff = percentDiff(ey_cpu[i * size + j], ey[i * size + j]);
+    // 		if(diff > ERROR_THRESHOLD) {
+    // 			printf("%zu %zu: %f %f %f\n", i, j, ey_cpu[i * size + j], ey[i * size + j], diff);
+    // 			return false;
+    // 		}
+    // 	}
+    // }
+    // Only hz is compared. The indices are size_t, so the mismatch report uses %zu rather than %ld.
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(hz_cpu[i * size + j], hz[i * size + j]);
+        if(diff > ERROR_THRESHOLD) {
+          printf("%zu %zu: %f %f %f\n", i, j, hz_cpu[i * size + j], hz[i * size + j], diff);
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Fdtd2d"; }
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> fict;
+  std::vector<DATA_TYPE> ex;
+  std::vector<DATA_TYPE> ey;
+  std::vector<DATA_TYPE> hz;
+
+  PrefetchedBuffer<DATA_TYPE, 1> fict_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> ex_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> ey_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> hz_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-        if(app.deviceSupportsFP64())
-          app.run<Polybench_Fdtd2d>();
-        return 0;
+  BenchmarkApp app(argc, argv);
+
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
+    app.run<Polybench_Fdtd2d>();
+  }
+  return 0;
 }
diff --git a/polybench/gemm.cpp b/polybench/gemm.cpp
index e1bff491..417763d4 100644
--- a/polybench/gemm.cpp
+++ b/polybench/gemm.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -16,121 +16,122 @@ using DATA_TYPE = float;
 class Gemm;
 
 void init(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NK; j++) {
-			A[i * NK + j] = ((DATA_TYPE)i * j) / NI;
-		}
-	}
-
-	for(size_t i = 0; i < NK; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			B[i * NJ + j] = ((DATA_TYPE)i * j + 1) / NJ;
-		}
-	}
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			C[i * NJ + j] = ((DATA_TYPE)i * j + 2) / NJ;
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NK; j++) {
+      A[i * NK + j] = ((DATA_TYPE)i * j) / NI;
+    }
+  }
+
+  for(size_t i = 0; i < NK; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      B[i * NJ + j] = ((DATA_TYPE)i * j + 1) / NJ;
+    }
+  }
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      C[i * NJ + j] = ((DATA_TYPE)i * j + 2) / NJ;
+    }
+  }
 }
 
 void gemm(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) {
-	const auto NI = size;
-	const auto NJ = size;
-	const auto NK = size;
-
-	for(size_t i = 0; i < NI; i++) {
-		for(size_t j = 0; j < NJ; j++) {
-			C[i * NJ + j] *= BETA;
-
-			for(size_t k = 0; k < NK; ++k) {
-				C[i * NJ + j] += ALPHA * A[i * NK + k] * B[k * NJ + j];
-			}
-		}
-	}
+  const auto NI = size;
+  const auto NJ = size;
+  const auto NK = size;
+
+  for(size_t i = 0; i < NI; i++) {
+    for(size_t j = 0; j < NJ; j++) {
+      C[i * NJ + j] *= BETA;
+
+      for(size_t k = 0; k < NK; ++k) {
+        C[i * NJ + j] += ALPHA * A[i * NK + k] * B[k * NJ + j];
+      }
+    }
+  }
 }
 
 class Polybench_Gemm {
-  public:
-	Polybench_Gemm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+public:
+  Polybench_Gemm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
-		C.resize(size * size);
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
+    C.resize(size * size);
 
-		init(A.data(), B.data(), C.data(), size);
+    init(A.data(), B.data(), C.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-		C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size));
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+    C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size));
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::read>(cgh);
-			auto C = C_buffer.get_access<access::mode::read_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::read>(cgh);
+      auto C = C_buffer.get_access<access::mode::read_write>(cgh);
 
-			cgh.parallel_for<Gemm>(C_buffer.get_range(), [=, NK_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
+      cgh.parallel_for<Gemm>(C_buffer.get_range(), [=, NK_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
 
-				C[item] *= BETA;
+        C[item] *= BETA;
 
-				for(size_t k = 0; k < NK_; k++) {
-					C[item] += ALPHA * A[{i, k}] * B[{k, j}];
-				}
-			});
-		}));
-	}
+        for(size_t k = 0; k < NK_; k++) {
+          C[item] += ALPHA * A[{i, k}] * B[{k, j}];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		// Trigger writeback
-		C_buffer.reset();
+    // Trigger writeback
+    C_buffer.reset();
 
-		std::vector<DATA_TYPE> C_cpu(size * size);
+    std::vector<DATA_TYPE> C_cpu(size * size);
 
-		init(A.data(), B.data(), C_cpu.data(), size);
+    init(A.data(), B.data(), C_cpu.data(), size);
 
-		gemm(A.data(), B.data(), C_cpu.data(), size);
+    gemm(A.data(), B.data(), C_cpu.data(), size);
 
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gemm"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gemm"; }
 
 private:
-	BenchmarkArgs args;
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-	std::vector<DATA_TYPE> C;
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+  std::vector<DATA_TYPE> C;
 
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Gemm>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Gemm>();
+  return 0;
 }
diff --git a/polybench/gesummv.cpp b/polybench/gesummv.cpp
index 9b3429f2..606e6786 100644
--- a/polybench/gesummv.cpp
+++ b/polybench/gesummv.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -23,120 +23,122 @@ constexpr DATA_TYPE BETA = 1;
 // 		if(percentDiff(y[i], y_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD) fail++;
 // 	}
 
-// 	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
+// 	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n",
+// PERCENT_DIFF_ERROR_THRESHOLD, fail);
 // }
 
 void init(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* x, size_t size) {
-	const auto N = size;
+  const auto N = size;
 
-	for(size_t i = 0; i < N; i++) {
-		x[i] = 1;
+  for(size_t i = 0; i < N; i++) {
+    x[i] = 1;
 
-		for(size_t j = 0; j < N; j++) {
-			A[i * N + j] = 2;
-			B[i * N + j] = 3;
-		}
-	}
+    for(size_t j = 0; j < N; j++) {
+      A[i * N + j] = 2;
+      B[i * N + j] = 3;
+    }
+  }
 }
 
 void gesummv(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* x, DATA_TYPE* y, DATA_TYPE* tmp, size_t size) {
-	const auto N = size;
-
-	for(size_t i = 0; i < N; i++) {
-		tmp[i] = 0;
-		y[i] = 0;
-		for(size_t j = 0; j < N; j++) {
-			tmp[i] = A[i * N + j] * x[j] + tmp[i];
-			y[i] = B[i * N + j] * x[j] + y[i];
-		}
-
-		y[i] = ALPHA * tmp[i] + BETA * y[i];
-	}
+  const auto N = size;
+
+  for(size_t i = 0; i < N; i++) {
+    tmp[i] = 0;
+    y[i] = 0;
+    for(size_t j = 0; j < N; j++) {
+      tmp[i] = A[i * N + j] * x[j] + tmp[i];
+      y[i] = B[i * N + j] * x[j] + y[i];
+    }
+
+    y[i] = ALPHA * tmp[i] + BETA * y[i];
+  }
 }
 
 class Polybench_Gesummv {
 public:
-	Polybench_Gesummv(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+  Polybench_Gesummv(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
-		x.resize(size);
-		y.resize(size);
-		tmp.resize(size);
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
+    x.resize(size);
+    y.resize(size);
+    tmp.resize(size);
 
-		init(A.data(), B.data(), x.data(), size);
+    init(A.data(), B.data(), x.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-		x_buffer.initialize(args.device_queue, x.data(), cl::sycl::range<1>(size));
-		y_buffer.initialize(args.device_queue, y.data(), cl::sycl::range<1>(size));
-		tmp_buffer.initialize(args.device_queue, tmp.data(), cl::sycl::range<1>(size));
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+    x_buffer.initialize(args.device_queue, x.data(), sycl::range<1>(size));
+    y_buffer.initialize(args.device_queue, y.data(), sycl::range<1>(size));
+    tmp_buffer.initialize(args.device_queue, tmp.data(), sycl::range<1>(size));
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::read>(cgh);
-			auto x = x_buffer.get_access<access::mode::read>(cgh);
-			auto y = y_buffer.get_access<access::mode::read_write>(cgh);
-			auto tmp = tmp_buffer.get_access<access::mode::read_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::read>(cgh);
+      auto x = x_buffer.get_access<access::mode::read>(cgh);
+      auto y = y_buffer.get_access<access::mode::read_write>(cgh);
+      auto tmp = tmp_buffer.get_access<access::mode::read_write>(cgh);
 
-			cgh.parallel_for<Gesummv>(y.get_range(), [=, N_ = size](item<1> item) {
-				const auto i = item[0];
+      cgh.parallel_for<Gesummv>(y.get_range(), [=, N_ = size](item<1> item) {
+        const auto i = item[0];
 
-				for(size_t j = 0; j < N_; j++) {
-					tmp[item] += A[{i, j}] * x[j];
-					y[item] += B[{i, j}] * x[j];
-				}
+        for(size_t j = 0; j < N_; j++) {
+          tmp[item] += A[{i, j}] * x[j];
+          y[item] += B[{i, j}] * x[j];
+        }
 
-				y[item] = ALPHA * tmp[item] + BETA * y[item];
-			});
-		}));
-	}
+        y[item] = ALPHA * tmp[item] + BETA * y[item];
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		// Trigger writeback
-		y_buffer.reset();
+    // Trigger writeback
+    y_buffer.reset();
 
-		std::vector<DATA_TYPE> y_cpu(size);
-		std::vector<DATA_TYPE> tmp_cpu(size);
+    std::vector<DATA_TYPE> y_cpu(size);
+    std::vector<DATA_TYPE> tmp_cpu(size);
 
-		gesummv(A.data(), B.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size);
+    gesummv(A.data(), B.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size);
 
-		for(size_t i = 0; i < size; i++) {
-			const auto diff = percentDiff(y_cpu[i], y[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-		}
+    for(size_t i = 0; i < size; i++) {
+      const auto diff = percentDiff(y_cpu[i], y[i]);
+      if(diff > ERROR_THRESHOLD)
+        return false;
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gesummv"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gesummv"; }
 
 private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-	std::vector<DATA_TYPE> x;
-	std::vector<DATA_TYPE> y;
-	std::vector<DATA_TYPE> tmp;
-
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> x_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> y_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> tmp_buffer;
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+  std::vector<DATA_TYPE> x;
+  std::vector<DATA_TYPE> y;
+  std::vector<DATA_TYPE> tmp;
+
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> x_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> y_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> tmp_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Gesummv>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Gesummv>();
+  return 0;
 }
diff --git a/polybench/gramschmidt.cpp b/polybench/gramschmidt.cpp
index ba447e17..368b4446 100644
--- a/polybench/gramschmidt.cpp
+++ b/polybench/gramschmidt.cpp
@@ -4,7 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -16,148 +16,153 @@ class Gramschmidt2;
 class Gramschmidt3;
 
 void init_array(DATA_TYPE* A, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	for(size_t i = 0; i < M; i++) {
-		for(size_t j = 0; j < N; j++) {
-			A[i * N + j] = ((DATA_TYPE)(i + 1) * (j + 1)) / (M + 1);
-		}
-	}
+  const auto M = size;
+  const auto N = size;
+
+  for(size_t i = 0; i < M; i++) {
+    for(size_t j = 0; j < N; j++) {
+      A[i * N + j] = ((DATA_TYPE)(i + 1) * (j + 1)) / (M + 1);
+    }
+  }
 }
 
 void gramschmidt(DATA_TYPE* A, DATA_TYPE* R, DATA_TYPE* Q, size_t size) {
-	const auto M = size;
-	const auto N = size;
-
-	for(size_t k = 0; k < N; k++) {
-		DATA_TYPE nrm = 0;
-		for(size_t i = 0; i < M; i++) {
-			nrm += A[i * N + k] * A[i * N + k];
-		}
-
-		R[k * N + k] = sqrt(nrm);
-		for(size_t i = 0; i < M; i++) {
-			Q[i * N + k] = A[i * N + k] / R[k * N + k];
-		}
-
-		for(size_t j = k + 1; j < N; j++) {
-			R[k * N + j] = 0;
-			for(size_t i = 0; i < M; i++) {
-				R[k * N + j] += Q[i * N + k] * A[i * N + j];
-			}
-			for(size_t i = 0; i < M; i++) {
-				A[i * N + j] = A[i * N + j] - Q[i * N + k] * R[k * N + j];
-			}
-		}
-	}
+  const auto M = size;
+  const auto N = size;
+
+  for(size_t k = 0; k < N; k++) {
+    DATA_TYPE nrm = 0;
+    for(size_t i = 0; i < M; i++) {
+      nrm += A[i * N + k] * A[i * N + k];
+    }
+
+    R[k * N + k] = sqrt(nrm);
+    for(size_t i = 0; i < M; i++) {
+      Q[i * N + k] = A[i * N + k] / R[k * N + k];
+    }
+
+    for(size_t j = k + 1; j < N; j++) {
+      R[k * N + j] = 0;
+      for(size_t i = 0; i < M; i++) {
+        R[k * N + j] += Q[i * N + k] * A[i * N + j];
+      }
+      for(size_t i = 0; i < M; i++) {
+        A[i * N + j] = A[i * N + j] - Q[i * N + k] * R[k * N + j];
+      }
+    }
+  }
 }
 
 class Polybench_Gramschmidt {
-  public:
-	Polybench_Gramschmidt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		A.resize(size * size);
-		R.resize(size * size);
-		Q.resize(size * size);
-
-		init_array(A.data(), size);
-
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		R_buffer.initialize(args.device_queue, R.data(), cl::sycl::range<2>(size, size));
-		Q_buffer.initialize(args.device_queue, Q.data(), cl::sycl::range<2>(size, size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		for(size_t k = 0; k < size; k++) {
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto A = A_buffer.get_access<access::mode::read>(cgh);
-				auto R = R_buffer.get_access<access::mode::write>(cgh);
-
-				cgh.parallel_for<Gramschmidt1>(range<2>(1, 1), [=, M_ = size](item<2> item) {
-					DATA_TYPE nrm = 0;
-					for(size_t i = 0; i < M_; i++) {
-						nrm += A[{i, k}] * A[{i, k}];
-					}
-					R[{k, k}] = cl::sycl::sqrt(nrm);
-				});
-			}));
-
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto A = A_buffer.get_access<access::mode::read>(cgh);
-				auto R = R_buffer.get_access<access::mode::read>(cgh);
-				auto Q = Q_buffer.get_access<access::mode::write>(cgh);
-
-				cgh.parallel_for<Gramschmidt2>(range<2>(size, 1), id<2>(0, k), [=](item<2> item) { Q[item] = A[item] / R[{k, k}]; });
-			}));
-
-			events.push_back(args.device_queue.submit([&](handler& cgh) {
-				auto A = A_buffer.get_access<access::mode::read_write>(cgh);
-				auto R = R_buffer.get_access<access::mode::write>(cgh);
-				auto Q = Q_buffer.get_access<access::mode::read>(cgh);
-
-				cgh.parallel_for<Gramschmidt3>(range<2>(size, 1), [=, M_ = size, N_ = size](item<2> item) {
-					const auto j = item[0];
-
-					if(j <= k || j >= N_) return;
-
-					R[item] = 0;
-					for(size_t i = 0; i < M_; i++) {
-						R[item] += Q[{i, k}] * A[{i, j}];
-					}
-
-					for(size_t i = 0; i < M_; i++) {
-						A[{i, j}] -= Q[{i, k}] * R[item];
-					}
-				});
-			}));
-		}
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		std::vector<DATA_TYPE> A_cpu(size * size);
-		std::vector<DATA_TYPE> R_cpu(size * size);
-		std::vector<DATA_TYPE> Q_cpu(size * size);
-
-		// Trigger writeback
-		A_buffer.reset();
-
-		init_array(A_cpu.data(), size);
-
-		gramschmidt(A_cpu.data(), R_cpu.data(), Q_cpu.data(), size);
-
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(A_cpu[i * size + j], A[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gramschmidt"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> R;
-	std::vector<DATA_TYPE> Q;
-
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> R_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> Q_buffer;
+public:
+  Polybench_Gramschmidt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} // copies args
+
+  void setup() { // Sizes the host matrices, fills A, then initializes the three buffers.
+    A.resize(size * size); // A, R and Q each hold size * size elements
+    R.resize(size * size);
+    Q.resize(size * size);
+
+    init_array(A.data(), size); // only A is filled here; R and Q keep whatever resize() produced
+
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); // host data + 2-D range
+    R_buffer.initialize(args.device_queue, R.data(), sycl::range<2>(size, size));
+    Q_buffer.initialize(args.device_queue, Q.data(), sycl::range<2>(size, size));
+  }
+
+  void run(std::vector<sycl::event>& events) { // Submits three kernels per k and appends each event.
+    using namespace sycl;
+
+    for(size_t k = 0; k < size; k++) {
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto A = A_buffer.get_access<access::mode::read>(cgh);
+        auto R = R_buffer.get_access<access::mode::write>(cgh);
+
+        cgh.parallel_for<Gramschmidt1>(range<2>(1, 1), [=, M_ = size](item<2> item) { // 1x1 range
+          DATA_TYPE nrm = 0;
+          for(size_t i = 0; i < M_; i++) {
+            nrm += A[{i, k}] * A[{i, k}]; // accumulates the squares of column k of A
+          }
+          R[{k, k}] = sycl::sqrt(nrm);
+        });
+      }));
+
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto A = A_buffer.get_access<access::mode::read>(cgh);
+        auto R = R_buffer.get_access<access::mode::read>(cgh);
+        auto Q = Q_buffer.get_access<access::mode::write>(cgh);
+
+        cgh.parallel_for<Gramschmidt2>(range<2>(size, 1), [=](item<2> gid) {
+          const id<2> offset(0, k); // moves the (row, 0) index of this work-item to column k
+          Q[gid + offset] = A[gid + offset] / R[{k, k}];
+        });
+      }));
+
+      events.push_back(args.device_queue.submit([&](handler& cgh) {
+        auto A = A_buffer.get_access<access::mode::read_write>(cgh);
+        auto R = R_buffer.get_access<access::mode::write>(cgh);
+        auto Q = Q_buffer.get_access<access::mode::read>(cgh);
+
+        cgh.parallel_for<Gramschmidt3>(range<2>(size, 1), [=, M_ = size, N_ = size](item<2> item) {
+          const auto j = item[0];
+
+          if(j <= k || j >= N_) // work-items with j <= k do nothing
+            return;
+
+          R[item] = 0; // NOTE(review): item is (j, 0) here; host gramschmidt() writes R[k * N + j] - confirm
+          for(size_t i = 0; i < M_; i++) {
+            R[item] += Q[{i, k}] * A[{i, j}];
+          }
+
+          for(size_t i = 0; i < M_; i++) {
+            A[{i, j}] -= Q[{i, k}] * R[item]; // updates column j of A in place
+          }
+        });
+      }));
+    }
+  }
+
+  bool verify(VerificationSetting&) { // Compares A (after the writeback below) with a host recomputation.
+    constexpr auto ERROR_THRESHOLD = 0.05; // upper bound accepted for percentDiff() results
+
+    std::vector<DATA_TYPE> A_cpu(size * size);
+    std::vector<DATA_TYPE> R_cpu(size * size);
+    std::vector<DATA_TYPE> Q_cpu(size * size);
+
+    // Trigger writeback
+    A_buffer.reset();
+
+    init_array(A_cpu.data(), size); // same initializer that setup() applied to A
+
+    gramschmidt(A_cpu.data(), R_cpu.data(), Q_cpu.data(), size); // host reference; modifies A_cpu in place
+
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(A_cpu[i * size + j], A[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false; // the first element above the threshold fails verification
+      }
+    }
+
+    return true; // only A is compared; R_cpu and Q_cpu are never checked against R and Q
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gramschmidt"; } // args unused
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> R;
+  std::vector<DATA_TYPE> Q;
+
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> R_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> Q_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Gramschmidt>();
-	return 0;
-}
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Gramschmidt>();
+  return 0;
+}
\ No newline at end of file
diff --git a/polybench/mvt.cpp b/polybench/mvt.cpp
index 497d14c4..b3077062 100644
--- a/polybench/mvt.cpp
+++ b/polybench/mvt.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -14,134 +14,136 @@ class Mvt1;
 class Mvt2;
 
 void init_arrays(DATA_TYPE* a, DATA_TYPE* x1, DATA_TYPE* x2, DATA_TYPE* y_1, DATA_TYPE* y_2, size_t size) {
-	const auto N = size;
-
-	for(size_t i = 0; i < N; i++) {
-		x1[i] = 0.0;
-		x2[i] = 0.0;
-		y_1[i] = 1.0;
-		y_2[i] = 1.0;
-
-		for(size_t j = 0; j < N; j++) {
-			a[i * N + j] = (DATA_TYPE)(i + j + 1.0) / N;
-		}
-	}
+  const auto N = size;
+
+  for(size_t i = 0; i < N; i++) {
+    x1[i] = 0.0;
+    x2[i] = 0.0;
+    y_1[i] = 1.0;
+    y_2[i] = 1.0;
+
+    for(size_t j = 0; j < N; j++) {
+      a[i * N + j] = (DATA_TYPE)(i + j + 1.0) / N;
+    }
+  }
 }
 
 void runMvt(DATA_TYPE* a, DATA_TYPE* x1, DATA_TYPE* x2, DATA_TYPE* y1, DATA_TYPE* y2, size_t size) {
-	const auto N = size;
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			x1[i] = x1[i] + a[i * N + j] * y1[j];
-		}
-	}
-
-	for(size_t k = 0; k < N; k++) {
-		for(size_t l = 0; l < N; l++) {
-			x2[k] = x2[k] + a[k * N + l] * y2[l];
-		}
-	}
+  const auto N = size;
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      x1[i] = x1[i] + a[i * N + j] * y1[j];
+    }
+  }
+
+  for(size_t k = 0; k < N; k++) {
+    for(size_t l = 0; l < N; l++) {
+      x2[k] = x2[k] + a[k * N + l] * y2[l];
+    }
+  }
 }
 
 class Polybench_Mvt {
-  public:
-	Polybench_Mvt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
-
-	void setup() {
-		a.resize(size * size);
-		x1.resize(size);
-		x2.resize(size);
-		y1.resize(size);
-		y2.resize(size);
-
-		init_arrays(a.data(), x1.data(), x2.data(), y1.data(), y2.data(), size);
-
-		a_buffer .initialize(args.device_queue, a.data(), cl::sycl::range<2>(size, size));
-		x1_buffer.initialize(args.device_queue, x1.data(), cl::sycl::range<1>(size));
-		x2_buffer.initialize(args.device_queue, x2.data(), cl::sycl::range<1>(size));
-		y1_buffer.initialize(args.device_queue, y1.data(), cl::sycl::range<1>(size));
-		y2_buffer.initialize(args.device_queue, y2.data(), cl::sycl::range<1>(size));
-	}
-
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto a = a_buffer.get_access<access::mode::read>(cgh);
-			auto y1 = y1_buffer.get_access<access::mode::read>(cgh);
-			auto x1 = x1_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Mvt1>(x1_buffer.get_range(), [=, N_ = size](item<1> item) {
-				const auto i = item[0];
-
-				for(size_t j = 0; j < N_; j++) {
-					x1[i] += a[{i, j}] * y1[j];
-				}
-			});
-		}));
-
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto a = a_buffer.get_access<access::mode::read>(cgh);
-			auto y2 = y2_buffer.get_access<access::mode::read>(cgh);
-			auto x2 = x2_buffer.get_access<access::mode::read_write>(cgh);
-
-			cgh.parallel_for<Mvt2>(x1_buffer.get_range(), [=, N_ = size](item<1> item) {
-				const auto k = item[0];
-
-				for(size_t l = 0; l < N_; l++) {
-					x2[k] += a[{k, l}] * y2[l];
-				}
-			});
-		}));
-	}
-
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
-
-		std::vector<DATA_TYPE> x1_cpu(size);
-		std::vector<DATA_TYPE> x2_cpu(size);
-
-		// Trigger writeback
-		x1_buffer.reset();
-		x2_buffer.reset();
-
-		init_arrays(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size);
-
-		runMvt(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size);
-
-		for(size_t i = 0; i < size; i++) {
-			auto diff = percentDiff(x1_cpu[i], x1[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-
-			diff = percentDiff(x2_cpu[i], x2[i]);
-			if(diff > ERROR_THRESHOLD) return false;
-		}
-
-		return true;
-	}
-
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Mvt"; }
-
-  private:
-	BenchmarkArgs args;
-
-	const size_t size;
-	std::vector<DATA_TYPE> a;
-	std::vector<DATA_TYPE> x1;
-	std::vector<DATA_TYPE> x2;
-	std::vector<DATA_TYPE> y1;
-	std::vector<DATA_TYPE> y2;
-
-	PrefetchedBuffer<DATA_TYPE, 2> a_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> x1_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> x2_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> y1_buffer;
-	PrefetchedBuffer<DATA_TYPE, 1> y2_buffer;
+public:
+  Polybench_Mvt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} // copies args
+
+  void setup() { // Sizes the host data, fills it via init_arrays(), then initializes the five buffers.
+    a.resize(size * size); // a holds size * size elements; the four vectors below hold size elements
+    x1.resize(size);
+    x2.resize(size);
+    y1.resize(size);
+    y2.resize(size);
+
+    init_arrays(a.data(), x1.data(), x2.data(), y1.data(), y2.data(), size); // x1/x2 = 0, y1/y2 = 1
+
+    a_buffer.initialize(args.device_queue, a.data(), sycl::range<2>(size, size)); // 2-D range for a
+    x1_buffer.initialize(args.device_queue, x1.data(), sycl::range<1>(size)); // 1-D ranges for the vectors
+    x2_buffer.initialize(args.device_queue, x2.data(), sycl::range<1>(size));
+    y1_buffer.initialize(args.device_queue, y1.data(), sycl::range<1>(size));
+    y2_buffer.initialize(args.device_queue, y2.data(), sycl::range<1>(size));
+  }
+
+  void run(std::vector<sycl::event>& events) { // Submits the Mvt1 and Mvt2 kernels; appends both events.
+    using namespace sycl;
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto a = a_buffer.get_access<access::mode::read>(cgh);
+      auto y1 = y1_buffer.get_access<access::mode::read>(cgh);
+      auto x1 = x1_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Mvt1>(x1_buffer.get_range(), [=, N_ = size](item<1> item) { // one work-item per x1[i]
+        const auto i = item[0];
+
+        for(size_t j = 0; j < N_; j++) {
+          x1[i] += a[{i, j}] * y1[j]; // row i of a times y1, accumulated onto the existing x1[i]
+        }
+      });
+    }));
+
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto a = a_buffer.get_access<access::mode::read>(cgh);
+      auto y2 = y2_buffer.get_access<access::mode::read>(cgh);
+      auto x2 = x2_buffer.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<Mvt2>(x2_buffer.get_range(), [=, N_ = size](item<1> item) { // sized by x2, its output
+        const auto k = item[0];
+
+        for(size_t l = 0; l < N_; l++) {
+          x2[k] += a[{k, l}] * y2[l]; // same a[{row, col}] indexing as Mvt1, matching host runMvt()
+        }
+      });
+    }));
+  }
+
+  bool verify(VerificationSetting&) { // Compares x1 and x2 (after writeback) with a host recomputation.
+    constexpr auto ERROR_THRESHOLD = 0.05; // upper bound accepted for percentDiff() results
+
+    std::vector<DATA_TYPE> x1_cpu(size);
+    std::vector<DATA_TYPE> x2_cpu(size);
+
+    // Trigger writeback
+    x1_buffer.reset();
+    x2_buffer.reset();
+
+    init_arrays(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); // also rewrites a, y1, y2
+
+    runMvt(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); // host reference result
+
+    for(size_t i = 0; i < size; i++) {
+      auto diff = percentDiff(x1_cpu[i], x1[i]);
+      if(diff > ERROR_THRESHOLD)
+        return false; // x1 mismatch
+
+      diff = percentDiff(x2_cpu[i], x2[i]);
+      if(diff > ERROR_THRESHOLD)
+        return false; // x2 mismatch
+    }
+
+    return true; // every element of both vectors is within the threshold
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Mvt"; } // args is unused
+
+private:
+  BenchmarkArgs args;
+
+  const size_t size;
+  std::vector<DATA_TYPE> a;
+  std::vector<DATA_TYPE> x1;
+  std::vector<DATA_TYPE> x2;
+  std::vector<DATA_TYPE> y1;
+  std::vector<DATA_TYPE> y2;
+
+  PrefetchedBuffer<DATA_TYPE, 2> a_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> x1_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> x2_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> y1_buffer;
+  PrefetchedBuffer<DATA_TYPE, 1> y2_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Mvt>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Mvt>();
+  return 0;
 }
diff --git a/polybench/syr2k.cpp b/polybench/syr2k.cpp
index d62fb7aa..63e231c6 100644
--- a/polybench/syr2k.cpp
+++ b/polybench/syr2k.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -16,117 +16,118 @@ constexpr DATA_TYPE ALPHA = 1;
 constexpr DATA_TYPE BETA = 1;
 
 void init_arrays(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) {
-	const auto N = size;
-	const auto M = size;
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			C[i * N + j] = ((DATA_TYPE)i * j + 2) / N;
-		}
-
-		for(size_t j = 0; j < M; j++) {
-			A[i * N + j] = ((DATA_TYPE)i * j) / N;
-			B[i * N + j] = ((DATA_TYPE)i * j + 1) / N;
-		}
-	}
+  const auto N = size;
+  const auto M = size;
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      C[i * N + j] = ((DATA_TYPE)i * j + 2) / N;
+    }
+
+    for(size_t j = 0; j < M; j++) {
+      A[i * N + j] = ((DATA_TYPE)i * j) / N;
+      B[i * N + j] = ((DATA_TYPE)i * j + 1) / N;
+    }
+  }
 }
 
 void syr2k(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) {
-	const auto N = size;
-	const auto M = size;
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			C[i * N + j] *= BETA;
-		}
-	}
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			for(size_t k = 0; k < M; k++) {
-				C[i * N + j] += ALPHA * A[i * M + k] * B[j * M + k];
-				C[i * N + j] += ALPHA * B[i * M + k] * A[j * M + k];
-			}
-		}
-	}
+  const auto N = size;
+  const auto M = size;
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      C[i * N + j] *= BETA;
+    }
+  }
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      for(size_t k = 0; k < M; k++) {
+        C[i * N + j] += ALPHA * A[i * M + k] * B[j * M + k];
+        C[i * N + j] += ALPHA * B[i * M + k] * A[j * M + k];
+      }
+    }
+  }
 }
 
 class Polybench_Syr2k {
-  public:
-	Polybench_Syr2k(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+public:
+  Polybench_Syr2k(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		B.resize(size * size);
-		C.resize(size * size);
+  void setup() {
+    A.resize(size * size);
+    B.resize(size * size);
+    C.resize(size * size);
 
-		init_arrays(A.data(), B.data(), C.data(), size);
+    init_arrays(A.data(), B.data(), C.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size));
-		C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size));
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size));
+    C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size));
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto B = B_buffer.get_access<access::mode::read>(cgh);
-			auto C = C_buffer.get_access<access::mode::read_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto B = B_buffer.get_access<access::mode::read>(cgh);
+      auto C = C_buffer.get_access<access::mode::read_write>(cgh);
 
-			cgh.parallel_for<Syr2k1>(C_buffer.get_range(), [=, M_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
+      cgh.parallel_for<Syr2k1>(C_buffer.get_range(), [=, M_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
 
-				C[item] *= BETA;
+        C[item] *= BETA;
 
-				for(size_t k = 0; k < M_; k++) {
-					C[item] += ALPHA * A[{i, k}] * B[{j, k}] + ALPHA * B[{i, k}] * A[{j, k}];
-				}
-			});
-		}));
-	}
+        for(size_t k = 0; k < M_; k++) {
+          C[item] += ALPHA * A[{i, k}] * B[{j, k}] + ALPHA * B[{i, k}] * A[{j, k}];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		std::vector<DATA_TYPE> C_cpu(size * size);
+    std::vector<DATA_TYPE> C_cpu(size * size);
 
-		init_arrays(A.data(), B.data(), C_cpu.data(), size);
+    init_arrays(A.data(), B.data(), C_cpu.data(), size);
 
-		// Trigger writeback
-		C_buffer.reset();
+    // Trigger writeback
+    C_buffer.reset();
 
-		syr2k(A.data(), B.data(), C_cpu.data(), size);
+    syr2k(A.data(), B.data(), C_cpu.data(), size);
 
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syr2k"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syr2k"; }
 
-  private:
-	BenchmarkArgs args;
+private:
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> B;
-	std::vector<DATA_TYPE> C;
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> B;
+  std::vector<DATA_TYPE> C;
 
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> B_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Syr2k>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Syr2k>();
+  return 0;
 }
diff --git a/polybench/syrk.cpp b/polybench/syrk.cpp
index a5b9ae6a..1d017f7c 100644
--- a/polybench/syrk.cpp
+++ b/polybench/syrk.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "common.h"
 #include "polybenchUtilFuncts.h"
@@ -16,111 +16,112 @@ constexpr DATA_TYPE alpha = 123;
 constexpr DATA_TYPE beta = 14512;
 
 void init_arrays(DATA_TYPE* A, DATA_TYPE* C, size_t size) {
-	const auto N = size;
-	const auto M = size;
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < M; j++) {
-			A[i * M + j] = ((DATA_TYPE)i * j) / N;
-		}
-
-		for(size_t j = 0; j < N; j++) {
-			C[i * M + j] = ((DATA_TYPE)i * j + 2) / N;
-		}
-	}
+  const auto N = size;
+  const auto M = size;
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < M; j++) {
+      A[i * M + j] = ((DATA_TYPE)i * j) / N;
+    }
+
+    for(size_t j = 0; j < N; j++) {
+      C[i * M + j] = ((DATA_TYPE)i * j + 2) / N;
+    }
+  }
 }
 
 void syrk(DATA_TYPE* A, DATA_TYPE* C, size_t size) {
-	const auto N = size;
-	const auto M = size;
-
-	/*  C := alpha*A*A' + beta*C */
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			C[i * M + j] *= beta;
-		}
-	}
-
-	for(size_t i = 0; i < N; i++) {
-		for(size_t j = 0; j < N; j++) {
-			for(size_t k = 0; k < M; k++) {
-				C[i * N + j] += alpha * A[i * M + k] * A[j * M + k];
-			}
-		}
-	}
+  const auto N = size;
+  const auto M = size;
+
+  /*  C := alpha*A*A' + beta*C */
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      C[i * M + j] *= beta;
+    }
+  }
+
+  for(size_t i = 0; i < N; i++) {
+    for(size_t j = 0; j < N; j++) {
+      for(size_t k = 0; k < M; k++) {
+        C[i * N + j] += alpha * A[i * M + k] * A[j * M + k];
+      }
+    }
+  }
 }
 
 class Polybench_Syrk {
-  public:
-	Polybench_Syrk(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
+public:
+  Polybench_Syrk(const BenchmarkArgs& args) : args(args), size(args.problem_size) {}
 
-	void setup() {
-		A.resize(size * size);
-		C.resize(size * size);
+  void setup() {
+    A.resize(size * size);
+    C.resize(size * size);
 
-		init_arrays(A.data(), C.data(), size);
+    init_arrays(A.data(), C.data(), size);
 
-		A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size));
-		C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size));
-	}
+    A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size));
+    C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size));
+  }
 
-	void run(std::vector<cl::sycl::event>& events) {
-		using namespace cl::sycl;
+  void run(std::vector<sycl::event>& events) {
+    using namespace sycl;
 
-		events.push_back(args.device_queue.submit([&](handler& cgh) {
-			auto A = A_buffer.get_access<access::mode::read>(cgh);
-			auto C = C_buffer.get_access<access::mode::read_write>(cgh);
+    events.push_back(args.device_queue.submit([&](handler& cgh) {
+      auto A = A_buffer.get_access<access::mode::read>(cgh);
+      auto C = C_buffer.get_access<access::mode::read_write>(cgh);
 
-			cgh.parallel_for<Syr2k2>(C_buffer.get_range(), [=, M_ = size](item<2> item) {
-				const auto i = item[0];
-				const auto j = item[1];
+      cgh.parallel_for<Syr2k2>(C_buffer.get_range(), [=, M_ = size](item<2> item) {
+        const auto i = item[0];
+        const auto j = item[1];
 
-				C[item] *= beta;
+        C[item] *= beta;
 
-				for(size_t k = 0; k < M_; k++) {
-					C[item] += alpha * A[{i, k}] * A[{j, k}];
-				}
-			});
-		}));
-	}
+        for(size_t k = 0; k < M_; k++) {
+          C[item] += alpha * A[{i, k}] * A[{j, k}];
+        }
+      });
+    }));
+  }
 
-	bool verify(VerificationSetting&) {
-		constexpr auto ERROR_THRESHOLD = 0.05;
+  bool verify(VerificationSetting&) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
 
-		// Trigger writeback
-		C_buffer.reset();
+    // Trigger writeback
+    C_buffer.reset();
 
-		std::vector<DATA_TYPE> C_cpu(size * size);
+    std::vector<DATA_TYPE> C_cpu(size * size);
 
-		init_arrays(A.data(), C_cpu.data(), size);
+    init_arrays(A.data(), C_cpu.data(), size);
 
-		syrk(A.data(), C_cpu.data(), size);
+    syrk(A.data(), C_cpu.data(), size);
 
-		for(size_t i = 0; i < size; i++) {
-			for(size_t j = 0; j < size; j++) {
-				const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
-				if(diff > ERROR_THRESHOLD) return false;
-			}
-		}
+    for(size_t i = 0; i < size; i++) {
+      for(size_t j = 0; j < size; j++) {
+        const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]);
+        if(diff > ERROR_THRESHOLD)
+          return false;
+      }
+    }
 
-		return true;
-	}
+    return true;
+  }
 
-	static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syrk"; }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syrk"; }
 
-  private:
-	BenchmarkArgs args;
+private:
+  BenchmarkArgs args;
 
-	const size_t size;
-	std::vector<DATA_TYPE> A;
-	std::vector<DATA_TYPE> C;
+  const size_t size;
+  std::vector<DATA_TYPE> A;
+  std::vector<DATA_TYPE> C;
 
-	PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
-	PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> A_buffer;
+  PrefetchedBuffer<DATA_TYPE, 2> C_buffer;
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	app.run<Polybench_Syrk>();
-	return 0;
+  BenchmarkApp app(argc, argv);
+  app.run<Polybench_Syrk>();
+  return 0;
 }
diff --git a/runtime/blocked_transform.cpp b/runtime/blocked_transform.cpp
index 10109b71..77c9e7d0 100644
--- a/runtime/blocked_transform.cpp
+++ b/runtime/blocked_transform.cpp
@@ -1,39 +1,32 @@
-
-
-
-
 #include "common.h"
 
+#include <cassert>
+#include <cmath>
 #include <functional>
 #include <iostream>
-#include <cmath>
-#include <cassert>
 
-using namespace cl;
 
-using complex = sycl::vec<float,2>;
+using complex = sycl::vec<float, 2>;
 
-inline complex mandelbrot_iteration(complex z, complex c)
-{
+inline complex mandelbrot_iteration(complex z, complex c) {
   complex result = c;
 
-  result.x() += z.x()*z.x() - z.y()*z.y();
-  result.y() += 2 * z.x()*z.y();
+  result.x() += z.x() * z.x() - z.y() * z.y();
+  result.y() += 2 * z.x() * z.y();
 
   return result;
 }
 
-template<int Num_iterations>
-complex mandelbrot_sequence(complex z0, complex c)
-{
+template <int Num_iterations>
+complex mandelbrot_sequence(complex z0, complex c) {
   complex z = z0;
-  for(int i = 0; i < Num_iterations; ++i){
+  for(int i = 0; i < Num_iterations; ++i) {
     z = mandelbrot_iteration(z, c);
   }
   return z;
 }
 
-template<int Num_iterations>
+template <int Num_iterations>
 class MandelbrotKernel;
 
 /// Performs a blocked transform operation using the mandelbrot sequence
@@ -47,57 +40,47 @@ class MandelbrotKernel;
 /// accessed ranges are non-overlapping. In order for the benchmark to stress
 /// these aspects, \c Num_iterations should be tuned such that the kernel
 /// runtime is similar to the data transfer time of one block.
-template<int Num_iterations>
-class BlockedTransform
-{
+template <int Num_iterations>
+class BlockedTransform {
 private:
-    std::vector<complex> data;
-    BenchmarkArgs args;
-    std::size_t block_size;
+  std::vector<complex> data;
+  BenchmarkArgs args;
+  std::size_t block_size;
+
 public:
-  BlockedTransform(
-    const BenchmarkArgs &_args,
-    std::size_t _block_size)
-    : args(_args), block_size{_block_size}
-    {
-      assert(block_size > 0);
-    }
-  
-  void setup() {     
-    init_data(data);
+  BlockedTransform(const BenchmarkArgs& _args, std::size_t _block_size) : args(_args), block_size{_block_size} {
+    assert(block_size > 0);
   }
 
-  void run(){
-    sycl::buffer<complex,1> buff {data.data(), sycl::range<1>{data.size()}};
+  void setup() { init_data(data); }
 
-    sycl::id<1> begin {0};
-    sycl::range<1> current_batch_size {block_size};
-    for(;begin[0] < data.size(); begin[0] += this->block_size) {
+  void run() {
+    sycl::buffer<complex, 1> buff{data.data(), sycl::range<1>{data.size()}};
 
-      current_batch_size[0] = std::min(this->block_size, data.size()-begin[0]);
+    sycl::id<1> begin{0};
+    sycl::range<1> current_batch_size{block_size};
+    for(; begin[0] < data.size(); begin[0] += this->block_size) {
+      current_batch_size[0] = std::min(this->block_size, data.size() - begin[0]);
 
-      args.device_queue.submit([&](sycl::handler &cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
+        auto acc = buff.get_access<sycl::access::mode::read_write>(cgh, current_batch_size, begin);
 
-        auto acc = buff.get_access<sycl::access::mode::read_write>(
-            cgh, current_batch_size, begin);
-
-        cgh.parallel_for<MandelbrotKernel<Num_iterations>>(
-            current_batch_size, begin, [=](cl::sycl::id<1> idx) {
-              const complex z0{0.0f, 0.0f};
-              acc[idx] = mandelbrot_sequence<Num_iterations>(z0, acc[idx]);
-            });
+        cgh.parallel_for<MandelbrotKernel<Num_iterations>>(current_batch_size, [=](sycl::id<1> idx) {
+          const sycl::id<1> gid = idx + begin; // launch has no offset any more: shift the id into this batch
+          acc[gid] = mandelbrot_sequence<Num_iterations>(complex{0.0f, 0.0f}, acc[gid]);
+        });
       });
     }
   }
 
-  bool verify(VerificationSetting &ver) { 
-    std::vector<complex>  v;
+  bool verify(VerificationSetting& ver) {
+    std::vector<complex> v;
     init_data(v);
 
     const double tol = 1.e-5;
 
     for(std::size_t i = 0; i < v.size(); ++i) {
-      v[i] = mandelbrot_sequence<Num_iterations>(complex{0.0f,0.0f}, v[i]);
+      v[i] = mandelbrot_sequence<Num_iterations>(complex{0.0f, 0.0f}, v[i]);
 
       if(std::abs(v[i].x() - data[i].x()) > tol)
         return false;
@@ -107,7 +90,7 @@ class BlockedTransform
 
     return true;
   }
-  
+
   std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "Runtime_BlockedTransform_iter_";
@@ -117,25 +100,21 @@ class BlockedTransform
   }
 
 private:
-  void init_data(std::vector<complex>& initial_data)
-  {
+  void init_data(std::vector<complex>& initial_data) {
     initial_data.clear();
     initial_data.resize(args.problem_size);
 
-    for(std::size_t i = 0; i < initial_data.size(); ++i)
-    {
-      initial_data[i].x() = 0.8*std::cos(i/args.problem_size);
-      initial_data[i].y() = 0.8*std::sin(i/args.problem_size);
+    for(std::size_t i = 0; i < initial_data.size(); ++i) { // divide in double: an integer i / problem_size is 0
+      initial_data[i].x() = 0.8 * std::cos(static_cast<double>(i) / args.problem_size);
+      initial_data[i].y() = 0.8 * std::sin(static_cast<double>(i) / args.problem_size);
+    }
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
-  for (std::size_t block_size = app.getArgs().local_size;
-       block_size < app.getArgs().problem_size; block_size *= 2) {
+  for(std::size_t block_size = app.getArgs().local_size; block_size < app.getArgs().problem_size; block_size *= 2) {
     app.run<BlockedTransform<64>>(block_size);
     app.run<BlockedTransform<128>>(block_size);
     app.run<BlockedTransform<256>>(block_size);
@@ -144,7 +123,3 @@ int main(int argc, char** argv)
 
   return 0;
 }
-
-
-
-
diff --git a/runtime/dag_task_throughput_independent.cpp b/runtime/dag_task_throughput_independent.cpp
index bd848341..dc136dc7 100644
--- a/runtime/dag_task_throughput_independent.cpp
+++ b/runtime/dag_task_throughput_independent.cpp
@@ -2,7 +2,7 @@
 
 #include <vector>
 
-using namespace cl;
+using namespace sycl;
 
 class IndependentDagTaskThroughputKernelSingleTask;
 class IndependentDagTaskThroughputKernelBasicPF;
@@ -10,106 +10,82 @@ class DagTaskThroughputKernelNdrangePF;
 class DagTaskThroughputKernelHierarchicalPF;
 
 // Measures the time it takes to run <problem-size> trivial single_task and parallel_for kernels
-// that are *independent*. 
+// that are *independent*.
 // This benchmark can be used to see how well a SYCL implementation
 // can utilize hardware concurrency.
-class IndependentDagTaskThroughput
-{
+class IndependentDagTaskThroughput {
   std::vector<sycl::buffer<int, 1>> dummy_buffers;
   BenchmarkArgs args;
+
 public:
-  IndependentDagTaskThroughput(const BenchmarkArgs &_args) 
-  : args(_args)
-  {}
-  
-  void setup() 
-  {
-    for (std::size_t i = 0; i < args.problem_size; ++i) {
+  IndependentDagTaskThroughput(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
+    for(std::size_t i = 0; i < args.problem_size; ++i) {
       dummy_buffers.push_back(sycl::buffer<int, 1>{sycl::range<1>{1}});
       forceDataAllocation(args.device_queue, dummy_buffers.back());
     }
   }
 
-  void submit_single_task()
-  {
+  void submit_single_task() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_buffers[i].get_access<sycl::access::mode::discard_write>(cgh);
-        
-        cgh.single_task<IndependentDagTaskThroughputKernelSingleTask>(
-          [=]()
-        {
-          acc[0] = i;
-        });  
+
+        cgh.single_task<IndependentDagTaskThroughputKernelSingleTask>([=]() { acc[0] = i; });
       }); // submit
     }
   }
 
-  void submit_basic_parallel_for()
-  {
+  void submit_basic_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_buffers[i].get_access<sycl::access::mode::discard_write>(cgh);
-        
+
         cgh.parallel_for<IndependentDagTaskThroughputKernelBasicPF>(
-          
-          sycl::range<1>{args.local_size},
-          [=](sycl::id<1> idx)
-        {
-          if(idx[0] == 0)
-            acc[0] = i;
-        });  
+
+            sycl::range<1>{args.local_size}, [=](sycl::id<1> idx) {
+              if(idx[0] == 0)
+                acc[0] = i;
+            });
       }); // submit
     }
   }
 
-  void submit_ndrange_parallel_for()
-  {
+  void submit_ndrange_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_buffers[i].get_access<sycl::access::mode::discard_write>(cgh);
-        
+
         cgh.parallel_for<DagTaskThroughputKernelNdrangePF>(
-          sycl::nd_range<1>{
-            sycl::range<1>{args.local_size},
-            sycl::range<1>{args.local_size}},
-          [=](sycl::nd_item<1> idx)
-        {
-          if(idx.get_global_id(0) == 0)
-            acc[0] = i;
-        });  
+            sycl::nd_range<1>{sycl::range<1>{args.local_size}, sycl::range<1>{args.local_size}},
+            [=](sycl::nd_item<1> idx) {
+              if(idx.get_global_id(0) == 0)
+                acc[0] = i;
+            });
       }); // submit
     }
   }
 
-  void submit_hierarchical_parallel_for()
-  {
+  void submit_hierarchical_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_buffers[i].get_access<sycl::access::mode::discard_write>(cgh);
-        
+
         cgh.parallel_for_work_group<DagTaskThroughputKernelHierarchicalPF>(
-          sycl::range<1>{1}, sycl::range<1>{args.local_size},
-          [=](sycl::group<1> grp)
-        {
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            if(idx.get_global_id(0) == 0)
-              acc[0] = i;
-          });
-        });  
+            sycl::range<1>{1}, sycl::range<1>{args.local_size}, [=](sycl::group<1> grp) {
+              grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+                if(idx.get_global_id(0) == 0)
+                  acc[0] = i;
+              });
+            });
       }); // submit
     }
   }
 
-  bool verify(VerificationSetting &ver) { 
-    for(std::size_t i = 0; i < dummy_buffers.size(); ++i){
-      auto host_acc =
-        dummy_buffers[i].get_access<sycl::access::mode::read>();
+  bool verify(VerificationSetting& ver) {
+    for(std::size_t i = 0; i < dummy_buffers.size(); ++i) {
+      auto host_acc = dummy_buffers[i].get_host_access();
 
       if(host_acc[0] != i)
         return false;
@@ -119,72 +95,49 @@ class IndependentDagTaskThroughput
   }
 };
 
-class IndependentDagTaskThroughputSingleTask
-    : public IndependentDagTaskThroughput
-{
+class IndependentDagTaskThroughputSingleTask : public IndependentDagTaskThroughput {
 public:
-  IndependentDagTaskThroughputSingleTask(const BenchmarkArgs& args)
-  : IndependentDagTaskThroughput{args} {}
+  IndependentDagTaskThroughputSingleTask(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {}
 
-  void run(){
-    submit_single_task();
-  }
+  void run() { submit_single_task(); }
 
-  static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Runtime_IndependentDAGTaskThroughput_SingleTask";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_IndependentDAGTaskThroughput_SingleTask"; }
 };
 
-class IndependentDagTaskThroughputBasicPF
-    : public IndependentDagTaskThroughput 
-{
+class IndependentDagTaskThroughputBasicPF : public IndependentDagTaskThroughput {
 public:
-  IndependentDagTaskThroughputBasicPF(const BenchmarkArgs& args)
-  : IndependentDagTaskThroughput{args} {}
+  IndependentDagTaskThroughputBasicPF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {}
 
-  void run(){
-    submit_basic_parallel_for();
-  }
+  void run() { submit_basic_parallel_for(); }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     return "Runtime_IndependentDAGTaskThroughput_BasicParallelFor";
   }
 };
 
-class IndependentDagTaskThroughputNDRangePF
-    : public IndependentDagTaskThroughput 
-{
+class IndependentDagTaskThroughputNDRangePF : public IndependentDagTaskThroughput {
 public:
-  IndependentDagTaskThroughputNDRangePF(const BenchmarkArgs& args)
-  : IndependentDagTaskThroughput{args} {}
+  IndependentDagTaskThroughputNDRangePF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {}
 
-  void run(){
-    submit_ndrange_parallel_for();
-  }
+  void run() { submit_ndrange_parallel_for(); }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     return "Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor";
   }
 };
 
-class IndependentDagTaskThroughputHierarchicalPF
-    : public IndependentDagTaskThroughput 
-{
+class IndependentDagTaskThroughputHierarchicalPF : public IndependentDagTaskThroughput {
 public:
-  IndependentDagTaskThroughputHierarchicalPF(const BenchmarkArgs& args)
-  : IndependentDagTaskThroughput{args} {}
+  IndependentDagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {}
 
-  void run(){
-    submit_hierarchical_parallel_for();
-  }
+  void run() { submit_hierarchical_parallel_for(); }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     return "Runtime_IndependentDAGTaskThroughput_HierarchicalParallelFor";
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   app.run<IndependentDagTaskThroughputSingleTask>();
@@ -194,6 +147,6 @@ int main(int argc, char** argv)
   // or triSYCL, this will be prohibitively slow
   if(app.shouldRunNDRangeKernels())
     app.run<IndependentDagTaskThroughputNDRangePF>();
-  
+
   return 0;
 }
diff --git a/runtime/dag_task_throughput_sequential.cpp b/runtime/dag_task_throughput_sequential.cpp
index 2f9eee54..f00032e1 100644
--- a/runtime/dag_task_throughput_sequential.cpp
+++ b/runtime/dag_task_throughput_sequential.cpp
@@ -1,6 +1,6 @@
 #include "common.h"
 
-using namespace cl;
+using namespace sycl;
 
 class DagTaskThroughputKernelSingleTask;
 class DagTaskThroughputKernelBasicPF;
@@ -10,166 +10,121 @@ class DagTaskThroughputKernelHierarchicalPF;
 // Measures the time it takes to run <problem-size> trivial single_task and parallel_for kernels
 // that depend on each other, and have to be executed in-order (-> Utilization of
 // parallel hardware is *not* tested)
-// This is influenced by 
+// This is influenced by
 // * latencies in task submission to the backend, e.g. GPU kernel latencies
 // * scheduling latencies caused by the SYCL implementation
 // * other overheads
-class DagTaskThroughput
-{
+class DagTaskThroughput {
   const int initial_value;
   PrefetchedBuffer<int, 1> dummy_counter;
   BenchmarkArgs args;
+
 public:
-  DagTaskThroughput(const BenchmarkArgs &_args) 
-  : initial_value{0}, args(_args)
-  {}
+  DagTaskThroughput(const BenchmarkArgs& _args) : initial_value{0}, args(_args) {}
 
   void setup() { dummy_counter.initialize(args.device_queue, &initial_value, sycl::range<1>{1}); }
 
-  void submit_single_task()
-  {
+  void submit_single_task() {
     // Behold! The weirdest, most inefficient summation algorithm ever conceived!
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_counter.get_access<sycl::access::mode::read_write>(cgh);
-        
-        cgh.single_task<DagTaskThroughputKernelSingleTask>(
-          [=]()
-        {
-          acc[0] += 1;
-        });  
+
+        cgh.single_task<DagTaskThroughputKernelSingleTask>([=]() { acc[0] += 1; });
       }); // submit
     }
   }
 
-  void submit_basic_parallel_for()
-  {
+  void submit_basic_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_counter.get_access<sycl::access::mode::read_write>(cgh);
-        
+
         cgh.parallel_for<DagTaskThroughputKernelBasicPF>(
-          // while we cannot control it, let's hope the SYCL implementation 
-          // spawns a single work group.
-          sycl::range<1>{args.local_size},
-          [=](sycl::id<1> idx)
-        {
-          if(idx[0] == 0)
-            acc[0] += 1;
-        });  
+            // while we cannot control it, let's hope the SYCL implementation
+            // spawns a single work group.
+            sycl::range<1>{args.local_size}, [=](sycl::id<1> idx) {
+              if(idx[0] == 0)
+                acc[0] += 1;
+            });
       }); // submit
     }
   }
 
-  void submit_ndrange_parallel_for()
-  {
+  void submit_ndrange_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_counter.get_access<sycl::access::mode::read_write>(cgh);
-        
+
         cgh.parallel_for<DagTaskThroughputKernelNdrangePF>(
-          sycl::nd_range<1>{
-            sycl::range<1>{args.local_size},
-            sycl::range<1>{args.local_size}},
-          [=](sycl::nd_item<1> idx)
-        {
-          if(idx.get_global_id(0) == 0)
-            acc[0] += 1;
-        });  
+            sycl::nd_range<1>{sycl::range<1>{args.local_size}, sycl::range<1>{args.local_size}},
+            [=](sycl::nd_item<1> idx) {
+              if(idx.get_global_id(0) == 0)
+                acc[0] += 1;
+            });
       }); // submit
     }
   }
 
-  void submit_hierarchical_parallel_for()
-  {
+  void submit_hierarchical_parallel_for() {
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-      args.device_queue.submit(
-          [&](cl::sycl::handler& cgh) {
+      args.device_queue.submit([&](sycl::handler& cgh) {
         auto acc = dummy_counter.get_access<sycl::access::mode::read_write>(cgh);
-        
+
         cgh.parallel_for_work_group<DagTaskThroughputKernelHierarchicalPF>(
-          sycl::range<1>{1}, sycl::range<1>{args.local_size},
-          [=](sycl::group<1> grp)
-        {
-          grp.parallel_for_work_item([&](sycl::h_item<1> idx){
-            if(idx.get_global_id(0) == 0)
-              acc[0] += 1;
-          });
-        });  
+            sycl::range<1>{1}, sycl::range<1>{args.local_size}, [=](sycl::group<1> grp) {
+              grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+                if(idx.get_global_id(0) == 0)
+                  acc[0] += 1;
+              });
+            });
       }); // submit
     }
   }
 
-  bool verify(VerificationSetting &ver) { 
-    auto host_acc =
-      dummy_counter.get_access<sycl::access::mode::read>();
+  bool verify(VerificationSetting& ver) {
+    auto host_acc = dummy_counter.get_host_access();
 
     return host_acc[0] == args.problem_size;
   }
 };
 
 
-class DagTaskThroughputSingleTask : public DagTaskThroughput
-{
+class DagTaskThroughputSingleTask : public DagTaskThroughput {
 public:
-  DagTaskThroughputSingleTask(const BenchmarkArgs& args)
-  : DagTaskThroughput{args} {}
+  DagTaskThroughputSingleTask(const BenchmarkArgs& args) : DagTaskThroughput{args} {}
 
-  void run(){
-    submit_single_task();
-  }
+  void run() { submit_single_task(); }
 
-  static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Runtime_DAGTaskThroughput_SingleTask";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_SingleTask"; }
 };
 
 
-class DagTaskThroughputBasicPF : public DagTaskThroughput
-{
+class DagTaskThroughputBasicPF : public DagTaskThroughput {
 public:
-  DagTaskThroughputBasicPF(const BenchmarkArgs& args)
-  : DagTaskThroughput{args} {}
+  DagTaskThroughputBasicPF(const BenchmarkArgs& args) : DagTaskThroughput{args} {}
 
-  void run(){
-    submit_basic_parallel_for();
-  }
+  void run() { submit_basic_parallel_for(); }
 
-  static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Runtime_DAGTaskThroughput_BasicParallelFor";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_BasicParallelFor"; }
 };
 
 
-class DagTaskThroughputNDRangePF : public DagTaskThroughput
-{
+class DagTaskThroughputNDRangePF : public DagTaskThroughput {
 public:
-  DagTaskThroughputNDRangePF(const BenchmarkArgs& args)
-  : DagTaskThroughput{args} {}
+  DagTaskThroughputNDRangePF(const BenchmarkArgs& args) : DagTaskThroughput{args} {}
 
-  void run(){
-    submit_ndrange_parallel_for();
-  }
+  void run() { submit_ndrange_parallel_for(); }
 
-  static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Runtime_DAGTaskThroughput_NDRangeParallelFor";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_NDRangeParallelFor"; }
 };
 
 
-class DagTaskThroughputHierarchicalPF : public DagTaskThroughput
-{
+class DagTaskThroughputHierarchicalPF : public DagTaskThroughput {
 public:
-  DagTaskThroughputHierarchicalPF(const BenchmarkArgs& args)
-  : DagTaskThroughput{args} {}
+  DagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) : DagTaskThroughput{args} {}
 
-  void run(){
-    submit_hierarchical_parallel_for();
-  }
+  void run() { submit_hierarchical_parallel_for(); }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     return "Runtime_DAGTaskThroughput_HierarchicalParallelFor";
@@ -177,8 +132,7 @@ class DagTaskThroughputHierarchicalPF : public DagTaskThroughput
 };
 
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
   app.run<DagTaskThroughputSingleTask>();
diff --git a/runtime/matmulchain.cpp b/runtime/matmulchain.cpp
index e8b89f33..44635921 100644
--- a/runtime/matmulchain.cpp
+++ b/runtime/matmulchain.cpp
@@ -10,38 +10,38 @@ template <typename T>
 class MatmulChain;
 
 template <typename T>
-void multiply(cl::sycl::queue& queue, cl::sycl::buffer<T, 2>& mat_a, cl::sycl::buffer<T, 2>& mat_b,
-    cl::sycl::buffer<T, 2>& mat_c, const size_t mat_size) {
-  queue.submit([&](cl::sycl::handler& cgh) {
-    auto a = mat_a.template get_access<cl::sycl::access::mode::read>(cgh);
-    auto b = mat_b.template get_access<cl::sycl::access::mode::read>(cgh);
-    auto c = mat_c.template get_access<cl::sycl::access::mode::discard_write>(cgh);
-
-		cgh.parallel_for<class MatmulChain<T>>(cl::sycl::range<2>(mat_size, mat_size), [=](cl::sycl::item<2> item) {
-			auto sum = 0;
-			for(size_t k = 0; k < mat_size; ++k) {
-				const auto a_ik = a[{item[0], k}];
-				const auto b_kj = b[{k, item[1]}];
-				sum += a_ik * b_kj;
-			}
-			c[item] = sum;
-		});
+void multiply(sycl::queue& queue, sycl::buffer<T, 2>& mat_a, sycl::buffer<T, 2>& mat_b, sycl::buffer<T, 2>& mat_c,
+    const size_t mat_size) {
+  queue.submit([&](sycl::handler& cgh) {
+    auto a = mat_a.template get_access<sycl::access::mode::read>(cgh);
+    auto b = mat_b.template get_access<sycl::access::mode::read>(cgh);
+    auto c = mat_c.template get_access<sycl::access::mode::discard_write>(cgh);
+
+    cgh.parallel_for<class MatmulChain<T>>(sycl::range<2>(mat_size, mat_size), [=](sycl::item<2> item) {
+      T sum = 0; // accumulate in the element type; `auto sum = 0` deduces int and truncates for floating-point T
+      for(size_t k = 0; k < mat_size; ++k) {
+        const auto a_ik = a[{item[0], k}];
+        const auto b_kj = b[{k, item[1]}];
+        sum += a_ik * b_kj;
+      }
+      c[item] = sum; // one output element per work-item: row item[0] of a times column item[1] of b
+    });
   });
 }
 
 
 template <typename T>
 class MatmulChain {
-protected:    
-	std::vector<T> mat_a;
-	std::vector<T> mat_b;
-	std::vector<T> mat_c;
-	std::vector<T> mat_d;
-	std::vector<T> mat_res;
-	BenchmarkArgs args;
-	int mat_size;
-
-	PrefetchedBuffer<T, 2> mat_a_buf;
+protected:
+  std::vector<T> mat_a;
+  std::vector<T> mat_b;
+  std::vector<T> mat_c;
+  std::vector<T> mat_d;
+  std::vector<T> mat_res;
+  BenchmarkArgs args;
+  int mat_size;
+
+  PrefetchedBuffer<T, 2> mat_a_buf;
   PrefetchedBuffer<T, 2> mat_b_buf;
   PrefetchedBuffer<T, 2> mat_c_buf;
   PrefetchedBuffer<T, 2> mat_d_buf;
@@ -50,68 +50,68 @@ class MatmulChain {
   PrefetchedBuffer<T, 2> mat_q_buf;
 
 public:
-	MatmulChain(const BenchmarkArgs &_args) : args(_args) {
-		mat_size = args.problem_size;
-	}
-
-	void setup() {
-		mat_a = std::vector<T>(mat_size * mat_size);
-		mat_b = std::vector<T>(mat_size * mat_size);
-		mat_c = std::vector<T>(mat_size * mat_size);
-		mat_d = std::vector<T>(mat_size * mat_size);
-		mat_res = std::vector<T>(mat_size * mat_size);
-
-		// Initialize matrices to the identity
-		for(size_t i = 0; i < mat_size; ++i) {
-			for(size_t j = 0; j < mat_size; ++j) {
-				mat_a[i * mat_size + j] = i == j;
-				mat_b[i * mat_size + j] = i == j;
-				mat_c[i * mat_size + j] = i == j;
-				mat_d[i * mat_size + j] = i == j;
-			}
-		}
-
-		mat_a_buf.initialize(args.device_queue, mat_a.data(), cl::sycl::range<2>(mat_size, mat_size));
-		mat_b_buf.initialize(args.device_queue, mat_b.data(), cl::sycl::range<2>(mat_size, mat_size));
-		mat_c_buf.initialize(args.device_queue, mat_c.data(), cl::sycl::range<2>(mat_size, mat_size));
-		mat_d_buf.initialize(args.device_queue, mat_d.data(), cl::sycl::range<2>(mat_size, mat_size));
-		mat_res_buf.initialize(args.device_queue, mat_res.data(), cl::sycl::range<2>(mat_size, mat_size));
-		mat_p_buf.initialize(args.device_queue, cl::sycl::range<2>(mat_size, mat_size));
-		mat_q_buf.initialize(args.device_queue, cl::sycl::range<2>(mat_size, mat_size));
-	}
-
-	void run() {
-		multiply(args.device_queue, mat_a_buf.get(), mat_b_buf.get(), mat_p_buf.get(), mat_size);
-		multiply(args.device_queue, mat_c_buf.get(), mat_d_buf.get(), mat_q_buf.get(), mat_size);
-		multiply(args.device_queue, mat_p_buf.get(), mat_q_buf.get(), mat_res_buf.get(), mat_size);
-	}
+  MatmulChain(const BenchmarkArgs& _args) : args(_args) { mat_size = args.problem_size; }
+
+  void setup() {
+    mat_a = std::vector<T>(mat_size * mat_size);
+    mat_b = std::vector<T>(mat_size * mat_size);
+    mat_c = std::vector<T>(mat_size * mat_size);
+    mat_d = std::vector<T>(mat_size * mat_size);
+    mat_res = std::vector<T>(mat_size * mat_size);
+
+    // Initialize matrices to the identity
+    for(size_t i = 0; i < mat_size; ++i) {
+      for(size_t j = 0; j < mat_size; ++j) {
+        mat_a[i * mat_size + j] = i == j;
+        mat_b[i * mat_size + j] = i == j;
+        mat_c[i * mat_size + j] = i == j;
+        mat_d[i * mat_size + j] = i == j;
+      }
+    }
+
+    mat_a_buf.initialize(args.device_queue, mat_a.data(), sycl::range<2>(mat_size, mat_size));
+    mat_b_buf.initialize(args.device_queue, mat_b.data(), sycl::range<2>(mat_size, mat_size));
+    mat_c_buf.initialize(args.device_queue, mat_c.data(), sycl::range<2>(mat_size, mat_size));
+    mat_d_buf.initialize(args.device_queue, mat_d.data(), sycl::range<2>(mat_size, mat_size));
+    mat_res_buf.initialize(args.device_queue, mat_res.data(), sycl::range<2>(mat_size, mat_size));
+    mat_p_buf.initialize(args.device_queue, sycl::range<2>(mat_size, mat_size));
+    mat_q_buf.initialize(args.device_queue, sycl::range<2>(mat_size, mat_size));
+  }
+
+  void run() {
+    multiply(args.device_queue, mat_a_buf.get(), mat_b_buf.get(), mat_p_buf.get(), mat_size);
+    multiply(args.device_queue, mat_c_buf.get(), mat_d_buf.get(), mat_q_buf.get(), mat_size);
+    multiply(args.device_queue, mat_p_buf.get(), mat_q_buf.get(), mat_res_buf.get(), mat_size);
+  }
 
   static std::string getBenchmarkName(BenchmarkArgs& args) { return "MatmulChain"; }
 
-  bool verify(VerificationSetting &ver) {
-		// Triggers writeback
-		mat_res_buf.reset();
-		bool verification_passed = true;
-
-		for(size_t i = 0; i < mat_size; ++i) {
-			for(size_t j = 0; j < mat_size; ++j) {
-				const T kernel_value = mat_res[i * mat_size + j];
-				const T host_value = i == j;
-				if(kernel_value != host_value) {
-					fprintf(stderr, "VERIFICATION FAILED for element %ld,%ld: %f != %f\n", i, j, kernel_value, host_value);
-					verification_passed = false;
-					break;
-				}
-			}
-			if(!verification_passed) { break; }
-		}
-		return verification_passed;
-	}
+  bool verify(VerificationSetting& ver) { // returns true iff every element of mat_res equals the identity matrix
+    // Triggers writeback
+    mat_res_buf.reset();
+    bool verification_passed = true;
+
+    for(size_t i = 0; i < mat_size; ++i) {
+      for(size_t j = 0; j < mat_size; ++j) {
+        const T kernel_value = mat_res[i * mat_size + j];
+        const T host_value = i == j; // expected identity entry: 1 on the diagonal, 0 elsewhere
+        if(kernel_value != host_value) {
+          fprintf(stderr, "VERIFICATION FAILED for element %zu,%zu: %f != %f\n", i, j, kernel_value, host_value);
+          verification_passed = false; // %zu above matches the size_t indices (%ld was a specifier mismatch)
+          break;
+        }
+      }
+      if(!verification_passed) { // stop at the first mismatching element
+        break;
+      }
+    }
+    return verification_passed;
+  }
 };
 
 int main(int argc, char** argv) {
-	BenchmarkApp app(argc, argv);
-	
-	// float 
-	app.run< MatmulChain<float> >();
+  BenchmarkApp app(argc, argv);
+
+  // float
+  app.run<MatmulChain<float>>();
 }
diff --git a/runtime/short_long.cpp b/runtime/short_long.cpp
index 83224239..736539ec 100644
--- a/runtime/short_long.cpp
+++ b/runtime/short_long.cpp
@@ -1 +1,2 @@
-TODO SYCL code with multiple tasks A -> b1 ... bn -> D and A -> C -> D where b1...bn are short task and C is a long task, so that runtime capabilities of kernel overlapping is evaluated
+TODO: SYCL code with multiple tasks A -> b1 ... bn -> D and A -> C -> D, where b1 ... bn are short tasks and C is a
+    long task, so that the runtime's ability to overlap kernels is evaluated
diff --git a/single-kernel/kmeans.cpp b/single-kernel/kmeans.cpp
index 974912ea..4c8da8c1 100644
--- a/single-kernel/kmeans.cpp
+++ b/single-kernel/kmeans.cpp
@@ -5,37 +5,38 @@
 #define FLT_MAX 500000.0
 #endif
 
-//using namespace cl::sycl;
-namespace s = cl::sycl;
-template <typename T> class KmeansKernel;
+// using namespace sycl;
+namespace s = sycl;
+template <typename T>
+class KmeansKernel;
 
 template <typename T>
-class KmeansBench
-{
-protected:    
-    std::vector<T> features;
-    std::vector<T> clusters;
-    std::vector<int> membership;
-    int nfeatures;
-	  int nclusters;
-    int feature_size;
-    int cluster_size;
-    BenchmarkArgs args;
-
-
-    PrefetchedBuffer<T, 1> features_buf;
-    PrefetchedBuffer<T, 1> clusters_buf;
-    PrefetchedBuffer<int, 1> membership_buf;
+class KmeansBench {
+protected:
+  std::vector<T> features;
+  std::vector<T> clusters;
+  std::vector<int> membership;
+  int nfeatures;
+  int nclusters;
+  int feature_size;
+  int cluster_size;
+  BenchmarkArgs args;
+
+
+  PrefetchedBuffer<T, 1> features_buf;
+  PrefetchedBuffer<T, 1> clusters_buf;
+  PrefetchedBuffer<int, 1> membership_buf;
+
 public:
-  KmeansBench(const BenchmarkArgs &_args) : args(_args) {}
-  
-  void setup() {      
+  KmeansBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
     // host memory allocation and initialization
     nfeatures = 2;
     nclusters = 3;
 
-    feature_size = nfeatures*args.problem_size;
-    cluster_size = nclusters*args.problem_size;
+    feature_size = nfeatures * args.problem_size;
+    cluster_size = nclusters * args.problem_size;
 
     features.resize(feature_size, 2.0f);
     clusters.resize(cluster_size, 1.0f);
@@ -46,43 +47,41 @@ class KmeansBench
     membership_buf.initialize(args.device_queue, membership.data(), s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto features = features_buf.template get_access<s::access::mode::read>(cgh);
       auto clusters = clusters_buf.template get_access<s::access::mode::read>(cgh);
       auto membership = membership_buf.template get_access<s::access::mode::discard_write>(cgh);
 
-      cl::sycl::range<1> ndrange(args.problem_size);
-
-      cgh.parallel_for<class KmeansKernel<T>>(ndrange,
-        [features, clusters, membership, problem_size = args.problem_size,
-         nclusters_ = nclusters, nfeatures_ = nfeatures]
-        (cl::sycl::id<1> idx){
-
-        size_t gid = idx[0];
-
-        if(gid < problem_size) {
-          int index = 0;
-          T min_dist = FLT_MAX;
-          for(size_t i = 0; i < nclusters_; i++) {
-            T dist = 0;
-            for(size_t l = 0; l < nfeatures_; l++) {
-              dist += (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]) *
-                      (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]);
-            }
-            if(dist < min_dist) {
-              min_dist = dist;
-              index = gid;
+      sycl::range<1> ndrange(args.problem_size); // one work-item per index in [0, problem_size)
+
+      cgh.parallel_for<class KmeansKernel<T>>(
+          ndrange, [features, clusters, membership, problem_size = args.problem_size, nclusters_ = nclusters,
+                       nfeatures_ = nfeatures](sycl::id<1> idx) {
+            size_t gid = idx[0];
+
+            if(gid < problem_size) {
+              int index = 0;
+              T min_dist = FLT_MAX; // smallest squared distance seen so far
+              for(size_t i = 0; i < nclusters_; i++) {
+                T dist = 0; // sum over l of squared differences between features[l * problem_size + gid] and clusters[i * nfeatures_ + l]
+                for(size_t l = 0; l < nfeatures_; l++) {
+                  dist += (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]) *
+                          (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]);
+                }
+                if(dist < min_dist) {
+                  min_dist = dist;
+                  index = gid; // NOTE(review): stores the work-item id, not the cluster index i -- confirm this is intended
+                }
+              }
+              membership[gid] = index;
             }
-          }
-          membership[gid] = index;
-        }
-      });
+          });
     }));
   }
 
-  bool verify(VerificationSetting &ver) {
-    auto membership_acc = membership_buf.template get_access<s::access::mode::read>();
+  bool verify(VerificationSetting& ver) {
+    auto membership_acc = membership_buf.get_host_access();
 
     bool pass = true;
     unsigned int equal = 1;
@@ -118,15 +117,15 @@ class KmeansBench
     std::stringstream name;
     name << "Kmeans_";
     name << ReadableTypename<T>::name;
-    return name.str();     
+    return name.str();
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
-  app.run<KmeansBench<float>> ();
-  if(app.deviceSupportsFP64())
+  app.run<KmeansBench<float>>();
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<KmeansBench<double>>();
+  }
   return 0;
 }
diff --git a/single-kernel/lin_reg_coeff.cpp b/single-kernel/lin_reg_coeff.cpp
index 47ad637d..29fb7a87 100644
--- a/single-kernel/lin_reg_coeff.cpp
+++ b/single-kernel/lin_reg_coeff.cpp
@@ -1,35 +1,36 @@
 #include "common.h"
 #include <iostream>
 
-//using namespace cl::sycl;
-namespace s = cl::sycl;
+// using namespace sycl;
+namespace s = sycl;
 
-template <typename T> class VecProductKernel;
-template <typename T> class VecReduceKernel;
+template <typename T>
+class VecProductKernel;
+template <typename T>
+class VecReduceKernel;
 
 template <typename T>
-class LinearRegressionCoeffBench
-{
-protected:    
-    std::vector<T> input1;
-    std::vector<T> input2;
-    std::vector<T> output;
-    T coeff_b1;
-    T coeff_b0;
-
-    // Only needed for verification as reduction is done inplace which modifies the input
-    std::vector<T> input1ver;
-    std::vector<T> input2ver;
-    BenchmarkArgs args;
-
-    PrefetchedBuffer<T, 1> input1_buf;
-    PrefetchedBuffer<T, 1> input2_buf;
-    PrefetchedBuffer<T, 1> output_buf;
+class LinearRegressionCoeffBench {
+protected:
+  std::vector<T> input1;
+  std::vector<T> input2;
+  std::vector<T> output;
+  T coeff_b1;
+  T coeff_b0;
+
+  // Only needed for verification as reduction is done inplace which modifies the input
+  std::vector<T> input1ver;
+  std::vector<T> input2ver;
+  BenchmarkArgs args;
+
+  PrefetchedBuffer<T, 1> input1_buf;
+  PrefetchedBuffer<T, 1> input2_buf;
+  PrefetchedBuffer<T, 1> output_buf;
 
 public:
-  LinearRegressionCoeffBench(const BenchmarkArgs &_args) : args(_args) {}
-  
-  void setup() {      
+  LinearRegressionCoeffBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
     // host memory allocation and initialization
     input1.resize(args.problem_size);
     input2.resize(args.problem_size);
@@ -38,9 +39,9 @@ class LinearRegressionCoeffBench
     input1ver.resize(args.problem_size);
     input2ver.resize(args.problem_size);
 
-    for (size_t i = 0; i < args.problem_size; i++) {
-       input1ver[i] = input1[i] = 1.0;
-       input2ver[i] = input2[i] = 2.0;
+    for(size_t i = 0; i < args.problem_size; i++) {
+      input1ver[i] = input1[i] = 1.0;
+      input2ver[i] = input2[i] = 2.0;
     }
 
     input1_buf.initialize(args.device_queue, input1.data(), s::range<1>(args.problem_size));
@@ -48,82 +49,77 @@ class LinearRegressionCoeffBench
     output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size));
   }
 
-  void vec_product(std::vector<cl::sycl::event>& events, s::buffer<T, 1> &input1_buf, s::buffer<T, 1> &input2_buf, s::buffer<T, 1> &output_buf) {
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
+  void vec_product(std::vector<sycl::event>& events, s::buffer<T, 1>& input1_buf, s::buffer<T, 1>& input2_buf,
+      s::buffer<T, 1>& output_buf) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in1 = input1_buf.template get_access<s::access::mode::read>(cgh);
       auto in2 = input2_buf.template get_access<s::access::mode::read>(cgh);
- 
-       // Use discard_write here, otherwise the content of the host buffer must first be copied to device
+
+      // Use discard_write here, otherwise the content of the host buffer must first be copied to device
       auto intermediate_product = output_buf.template get_access<s::access::mode::discard_write>(cgh);
 
-      cl::sycl::nd_range<1> ndrange (args.problem_size, args.local_size);
+      sycl::nd_range<1> ndrange(args.problem_size, args.local_size);
 
-      cgh.parallel_for<class VecProductKernel<T>>(ndrange,
-        [=](cl::sycl::nd_item<1> item) 
-        {
-          size_t gid= item.get_global_linear_id();
-          intermediate_product[gid] = in1[gid] * in2[gid];
-        });
+      cgh.parallel_for<class VecProductKernel<T>>(ndrange, [=](sycl::nd_item<1> item) {
+        size_t gid = item.get_global_linear_id();
+        intermediate_product[gid] = in1[gid] * in2[gid];
+      });
     }));
   }
 
-T reduce(std::vector<cl::sycl::event>& events, s::buffer<T, 1> &input_buf) {
-  auto array_size = args.problem_size;
-  auto wgroup_size = args.local_size;
-  // Not yet tested with more than 2
-  auto elements_per_thread = 2;
+  T reduce(std::vector<sycl::event>& events, s::buffer<T, 1>& input_buf) {
+    auto array_size = args.problem_size;
+    auto wgroup_size = args.local_size;
+    // Not yet tested with more than 2
+    auto elements_per_thread = 2;
 
-  while (array_size!= 1) {
-    auto n_wgroups = (array_size + wgroup_size*elements_per_thread - 1)/(wgroup_size*elements_per_thread); // two threads per work item
-
-    events.push_back(args.device_queue.submit(
-      [&](cl::sycl::handler& cgh) {
+    while(array_size != 1) {
+      auto n_wgroups = (array_size + wgroup_size * elements_per_thread - 1) /
+                       (wgroup_size * elements_per_thread); // each work-item sums elements_per_thread (2) elements
 
+      events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
         auto global_mem = input_buf.template get_access<s::access::mode::read_write>(cgh);
-    
+
         // local memory for reduction
-        auto local_mem = s::accessor <T, 1, s::access::mode::read_write, s::access::target::local> {s::range<1>(wgroup_size), cgh};
-        cl::sycl::nd_range<1> ndrange (n_wgroups*wgroup_size, wgroup_size);
-  
-        cgh.parallel_for<class VecReduceKernel<T>>(ndrange,
-        [=](cl::sycl::nd_item<1> item) 
-          {
-            size_t gid= item.get_global_linear_id();
-            size_t lid = item.get_local_linear_id();
-
-            // initialize local memory to 0
-            local_mem[lid] = 0; 
-
-            if ((elements_per_thread * gid) < array_size) {
-                local_mem[lid] = global_mem[elements_per_thread*gid] + global_mem[elements_per_thread*gid + 1];
-            }
+        auto local_mem = s::local_accessor<T, 1>{s::range<1>(wgroup_size), cgh};
+        sycl::nd_range<1> ndrange(n_wgroups * wgroup_size, wgroup_size);
 
-            item.barrier(s::access::fence_space::local_space);
+        cgh.parallel_for<class VecReduceKernel<T>>(ndrange, [=](sycl::nd_item<1> item) {
+          size_t gid = item.get_global_linear_id();
+          size_t lid = item.get_local_linear_id();
 
-            for (size_t stride = 1; stride < wgroup_size; stride *= elements_per_thread) {
-              auto local_mem_index = elements_per_thread * stride * lid;
-              if (local_mem_index < wgroup_size) {
-                  local_mem[local_mem_index] = local_mem[local_mem_index] + local_mem[local_mem_index + stride];
-              }
+          // initialize local memory to 0
+          local_mem[lid] = 0;
 
-              item.barrier(s::access::fence_space::local_space);
-            }
+          if((elements_per_thread * gid) < array_size) { // second element of the pair is past the end when array_size is odd
+            local_mem[lid] = global_mem[elements_per_thread * gid] + ((elements_per_thread * gid + 1) < array_size ? global_mem[elements_per_thread * gid + 1] : T(0));
+          }
+
+          sycl::group_barrier(item.get_group());
 
-            // Only one work-item per work group writes to global memory 
-            if (lid == 0) {
-              global_mem[item.get_group_linear_id()] = local_mem[0];
+
+          for(size_t stride = 1; stride < wgroup_size; stride *= elements_per_thread) {
+            auto local_mem_index = elements_per_thread * stride * lid;
+            if(local_mem_index < wgroup_size) {
+              local_mem[local_mem_index] = local_mem[local_mem_index] + local_mem[local_mem_index + stride];
             }
-          });
+
+            sycl::group_barrier(item.get_group());
+          }
+
+          // Only one work-item per work group writes to global memory
+          if(lid == 0) {
+            global_mem[item.get_group_linear_id()] = local_mem[0];
+          }
+        });
       }));
-    array_size = n_wgroups;
+      array_size = n_wgroups;
+    }
+    auto reduced_value = input_buf.get_host_access();
+    return (reduced_value[0]);
   }
-  auto reduced_value = input_buf.template get_access<s::access::mode::read>();
-  return(reduced_value[0]);
-}
-
-  void run(std::vector<cl::sycl::event>& events) {
 
+  void run(std::vector<sycl::event>& events) {
     vec_product(events, input1_buf.get(), input2_buf.get(), output_buf.get());
 
     T ss_xy = reduce(events, output_buf.get());
@@ -132,71 +128,72 @@ T reduce(std::vector<cl::sycl::event>& events, s::buffer<T, 1> &input_buf) {
 
     T ss_xx = reduce(events, output_buf.get());
 
-    T mean_x = reduce(events, input1_buf.get())/args.problem_size;
-    T mean_y = reduce(events, input2_buf.get())/args.problem_size;
+    T mean_x = reduce(events, input1_buf.get()) / args.problem_size;
+    T mean_y = reduce(events, input2_buf.get()) / args.problem_size;
 
-    ss_xy = ss_xy - mean_x*mean_y;
-    ss_xx = ss_xx - mean_x*mean_x;
+    ss_xy = ss_xy - mean_x * mean_y;
+    ss_xx = ss_xx - mean_x * mean_x;
 
-    coeff_b1 = ss_xy/ss_xx;
-    coeff_b0 = mean_y - coeff_b1*mean_x;
+    coeff_b1 = ss_xy / ss_xx;
+    coeff_b0 = mean_y - coeff_b1 * mean_x;
 
-    //std::cout << "ss_xy = " << ss_xy << "ss_xx = " << ss_xx << std::endl;
-    //std::cout << "Mean_x = " << mean_x << "Mean_y = " << mean_y << std::endl;
-    //std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl;
+    // std::cout << "ss_xy = " << ss_xy << "ss_xx = " << ss_xx << std::endl;
+    // std::cout << "Mean_x = " << mean_x << "Mean_y = " << mean_y << std::endl;
+    // std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl;
   }
 
-  bool verify(VerificationSetting &ver) { 
-     bool pass = true;
-    
+  bool verify(VerificationSetting& ver) {
+    bool pass = true;
+
     T sum_of_vec1 = 0;
     T sum_of_vec2 = 0;
-    for (size_t i = 0; i < args.problem_size; i++) {
+    for(size_t i = 0; i < args.problem_size; i++) {
       sum_of_vec1 += input1ver[i];
       sum_of_vec2 += input2ver[i];
     }
 
-    T mean_x = sum_of_vec1/args.problem_size;
-    T mean_y = sum_of_vec2/args.problem_size;
+    T mean_x = sum_of_vec1 / args.problem_size;
+    T mean_y = sum_of_vec2 / args.problem_size;
 
     T ss_xy = 0;
     T ss_xx = 0;
-    for (size_t i = 0; i < args.problem_size; i++) {
-      ss_xy += input1ver[i]*input2ver[i];
-      ss_xx += input1ver[i]*input1ver[i];
+    for(size_t i = 0; i < args.problem_size; i++) {
+      ss_xy += input1ver[i] * input2ver[i];
+      ss_xx += input1ver[i] * input1ver[i];
     }
 
-    ss_xy = ss_xy - mean_x*mean_y;
-    ss_xx = ss_xx - mean_x*mean_x;
+    ss_xy = ss_xy - mean_x * mean_y;
+    ss_xx = ss_xx - mean_x * mean_x;
 
-    T expected_coeff_b1 = ss_xy/ss_xx;
-    T expected_coeff_b0 = mean_y - expected_coeff_b1*mean_x;
+    T expected_coeff_b1 = ss_xy / ss_xx;
+    T expected_coeff_b0 = mean_y - expected_coeff_b1 * mean_x;
 
-    //std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl;
-    //std::cout << "Expected Coeff_b1 = " << expected_coeff_b1 << ", " << "Expected Coeff_b0 = " << expected_coeff_b0 << std::endl;
+    // std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl;
+    // std::cout << "Expected Coeff_b1 = " << expected_coeff_b1 << ", " << "Expected Coeff_b0 = " << expected_coeff_b0
+    // << std::endl;
 
     const T tolerance = 0.00001;
-    if ((fabs(expected_coeff_b0 - coeff_b0) > tolerance) || (fabs(expected_coeff_b1 - coeff_b1) > tolerance))
+    if((fabs(expected_coeff_b0 - coeff_b0) > tolerance) || (fabs(expected_coeff_b1 - coeff_b1) > tolerance))
       pass = false;
 
     return pass;
   }
-  
+
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "LinearRegressionCoeff_";
     name << ReadableTypename<T>::name;
-    return name.str();     
+    return name.str();
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
-  if(app.shouldRunNDRangeKernels()){
+  if(app.shouldRunNDRangeKernels()) {
     app.run<LinearRegressionCoeffBench<float>>();
-    if(app.deviceSupportsFP64())
+    if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
       app.run<LinearRegressionCoeffBench<double>>();
+    }
   }
   return 0;
 }
diff --git a/single-kernel/lin_reg_error.cpp b/single-kernel/lin_reg_error.cpp
index c2d169e1..0b49e097 100644
--- a/single-kernel/lin_reg_error.cpp
+++ b/single-kernel/lin_reg_error.cpp
@@ -1,32 +1,32 @@
 #include "common.h"
 #include <iostream>
 
-//using namespace cl::sycl;
-namespace s = cl::sycl;
-template <typename T> class LinearRegressionKernel;
+// using namespace sycl;
+namespace s = sycl;
+template <typename T>
+class LinearRegressionKernel;
 
 template <typename T>
-class LinearRegressionBench
-{
-protected:    
-    std::vector<T> input1;
-    std::vector<T> input2;
-    std::vector<T> alpha;
-    std::vector<T> beta;
-    std::vector<T> output;
-    std::vector<T> expected_output;
-    BenchmarkArgs args;
-
-    PrefetchedBuffer<T, 1> input1_buf;
-    PrefetchedBuffer<T, 1> input2_buf;
-    PrefetchedBuffer<T, 1> alpha_buf;
-    PrefetchedBuffer<T, 1> beta_buf;
-    PrefetchedBuffer<T, 1> output_buf;
+class LinearRegressionBench {
+protected:
+  std::vector<T> input1;
+  std::vector<T> input2;
+  std::vector<T> alpha;
+  std::vector<T> beta;
+  std::vector<T> output;
+  std::vector<T> expected_output;
+  BenchmarkArgs args;
+
+  PrefetchedBuffer<T, 1> input1_buf;
+  PrefetchedBuffer<T, 1> input2_buf;
+  PrefetchedBuffer<T, 1> alpha_buf;
+  PrefetchedBuffer<T, 1> beta_buf;
+  PrefetchedBuffer<T, 1> output_buf;
 
 public:
-  LinearRegressionBench(const BenchmarkArgs &_args) : args(_args) {}
-  
-  void setup() {      
+  LinearRegressionBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
     // host memory allocation and initialization
     input1.resize(args.problem_size);
     input2.resize(args.problem_size);
@@ -35,24 +35,22 @@ class LinearRegressionBench
     output.resize(args.problem_size, 0);
     expected_output.resize(args.problem_size, 0);
 
-    for (size_t i = 0; i < args.problem_size; i++) {
-      input1[i] = static_cast <T> (rand()) / static_cast <T> (RAND_MAX);
-      input2[i] = static_cast <T> (rand()) / static_cast <T> (RAND_MAX);
-      alpha[i] = static_cast <T> (rand()) / static_cast <T> (RAND_MAX);
-      beta[i] = static_cast <T> (rand()) / static_cast <T> (RAND_MAX);
+    for(size_t i = 0; i < args.problem_size; i++) {
+      input1[i] = static_cast<T>(rand()) / static_cast<T>(RAND_MAX);
+      input2[i] = static_cast<T>(rand()) / static_cast<T>(RAND_MAX);
+      alpha[i] = static_cast<T>(rand()) / static_cast<T>(RAND_MAX);
+      beta[i] = static_cast<T>(rand()) / static_cast<T>(RAND_MAX);
     }
 
     input1_buf.initialize(args.device_queue, input1.data(), s::range<1>(args.problem_size));
     input2_buf.initialize(args.device_queue, input2.data(), s::range<1>(args.problem_size));
-    alpha_buf. initialize(args.device_queue, alpha.data(), s::range<1>(args.problem_size));
-    beta_buf.  initialize(args.device_queue, beta.data(), s::range<1>(args.problem_size));
+    alpha_buf.initialize(args.device_queue, alpha.data(), s::range<1>(args.problem_size));
+    beta_buf.initialize(args.device_queue, beta.data(), s::range<1>(args.problem_size));
     output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in1 = input1_buf.template get_access<s::access::mode::read>(cgh);
       auto in2 = input2_buf.template get_access<s::access::mode::read>(cgh);
       auto alpha = alpha_buf.template get_access<s::access::mode::read>(cgh);
@@ -60,79 +58,77 @@ class LinearRegressionBench
       // Use discard_write here, otherwise the content of the host buffer must first be copied to device
       auto output = output_buf.template get_access<s::access::mode::discard_write>(cgh);
 
-      cl::sycl::range<1> ndrange (args.problem_size);
+      sycl::range<1> ndrange(args.problem_size);
 
-      cgh.parallel_for<class LinearRegressionKernel<T>>(ndrange,
-        [=, problem_size = args.problem_size](cl::sycl::id<1> idx)
-        {
-          size_t gid= idx[0];
-          T a = alpha[gid];
-          T b = beta[gid];
-          T error = 0.0;
-          if (gid < problem_size) {
+      cgh.parallel_for<class LinearRegressionKernel<T>>(
+          ndrange, [=, problem_size = args.problem_size](sycl::id<1> idx) {
+            size_t gid = idx[0];
+            T a = alpha[gid];
+            T b = beta[gid];
+            T error = 0.0;
+            if(gid < problem_size) {
               // Use parallel reduction to add errors
-              for (size_t i = 0; i < problem_size; i++) {
-                T e = (a*in1[i] + b) - in2[i];
-                error += e*e;
+              for(size_t i = 0; i < problem_size; i++) {
+                T e = (a * in1[i] + b) - in2[i];
+                error += e * e;
               }
-          }
-          output[gid] = error;
-        });
+            }
+            output[gid] = error;
+          });
     }));
   }
 
   bool compare(const std::vector<T>& expected_output, const int length, const T epsilon) {
-      T error = 0.0f;
-      T ref = 0.0f;
+    T error = 0.0f;
+    T ref = 0.0f;
 
-      auto output = output_buf.template get_access<s::access::mode::read>();
+    auto output = output_buf.get_host_access();
 
-      for(size_t i = 0; i < length; ++i) {
-          T diff = expected_output[i] - output[i];
-          error += diff * diff;
-          ref += expected_output[i] * expected_output[i];
-      }
+    for(size_t i = 0; i < length; ++i) {
+      T diff = expected_output[i] - output[i];
+      error += diff * diff;
+      ref += expected_output[i] * expected_output[i];
+    }
 
-      T normRef = sqrtf((T) ref);
-      if (fabs(ref) < 1e-7f) {
-          return false;
-      }
+    T normRef = sqrtf((T)ref);
+    if(fabs(ref) < 1e-7f) {
+      return false;
+    }
 
-      T normError = sqrtf((T) error);
-      error = normError / normRef;
+    T normError = sqrtf((T)error);
+    error = normError / normRef;
 
-      //std::cout << "error =" << error << "epsilon =" << epsilon;
+    // std::cout << "error =" << error << "epsilon =" << epsilon;
 
-      return error < epsilon;
+    return error < epsilon;
   }
 
-  bool verify(VerificationSetting &ver) { 
-
-    for (size_t i = 0; i < args.problem_size; i ++) {
+  bool verify(VerificationSetting& ver) {
+    for(size_t i = 0; i < args.problem_size; i++) {
       T error = 0.0;
       for(size_t j = 0; j < args.problem_size; j++) {
         T e = (alpha[i] * input1[j] + beta[i]) - input2[j];
-        error += e*e;
+        error += e * e;
       }
-      expected_output[i] = error; 
+      expected_output[i] = error;
     }
 
     return compare(expected_output, args.problem_size, 0.000001);
   }
-  
+
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "LinearRegression_";
     name << ReadableTypename<T>::name;
-    return name.str();     
+    return name.str();
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
   app.run<LinearRegressionBench<float>>();
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<LinearRegressionBench<double>>();
+  }
   return 0;
 }
diff --git a/single-kernel/median.cpp b/single-kernel/median.cpp
index 2c7d4c8d..d8659f33 100644
--- a/single-kernel/median.cpp
+++ b/single-kernel/median.cpp
@@ -1,14 +1,14 @@
-#include <CL/sycl.hpp>
 #include <iostream>
+#include <sycl/sycl.hpp>
 
-#include "common.h"
 #include "bitmap.h"
+#include "common.h"
 
 
-namespace s = cl::sycl;
+namespace s = sycl;
 class MedianFilterBenchKernel; // kernel forward declaration
 
-void swap(cl::sycl::float4 A[], int i, int j) {
+void swap(sycl::float4 A[], int i, int j) {
   /*if(A[i] > A[j]) {
     float temp = A[i];
     A[i] = A[j];
@@ -20,126 +20,121 @@ void swap(cl::sycl::float4 A[], int i, int j) {
 
 /*
   A median filter with a windows of 3 pixels (3x3).
-  Input and output are two-dimensional buffers of floats.     
+  Input and output are two-dimensional buffers of floats.
  */
-class MedianFilterBench
-{
+class MedianFilterBench {
 protected:
-    std::vector<cl::sycl::float4> input;
-    std::vector<cl::sycl::float4> output;
+  std::vector<sycl::float4> input;
+  std::vector<sycl::float4> output;
 
-    size_t w, h; // size of the input picture
-    size_t size; // user-defined size (input and output will be size x size)
-    BenchmarkArgs args;
+  size_t w, h; // size of the input picture
+  size_t size; // user-defined size (input and output will be size x size)
+  BenchmarkArgs args;
 
-    PrefetchedBuffer<cl::sycl::float4, 2>  input_buf;    
-    PrefetchedBuffer<cl::sycl::float4, 2> output_buf;
+  PrefetchedBuffer<sycl::float4, 2> input_buf;
+  PrefetchedBuffer<sycl::float4, 2> output_buf;
 
 public:
-  MedianFilterBench(const BenchmarkArgs &_args) : args(_args) {}
+  MedianFilterBench(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
     size = args.problem_size; // input size defined by the user
-    input.resize(size * size); 
+    input.resize(size * size);
     load_bitmap_mirrored("../share/Brommy.bmp", size, input);
     output.resize(size * size);
 
-    input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size));    
+    input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size));
     output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
-      auto in  = input_buf .get_access<s::access::mode::read>(cgh);
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+      auto in = input_buf.get_access<s::access::mode::read>(cgh);
       auto out = output_buf.get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<2> ndrange {size, size};
-
-      cgh.parallel_for<class MedianFilterBenchKernel>(ndrange,
-        [in, out, size_ = size](cl::sycl::id<2> gid)
-        {
-          int x = gid[0];
-          int y = gid[1];
-
-          // Optimization note: this array can be prefetched in local memory, TODO
-	  cl::sycl::float4 window[9];
-          int k = 0;
-          for(int i = -1; i<2; i++)
-            for(int j = -1; j<2; j++) {
-              uint xs = s::min(s::max(x+j, 0), static_cast<int>(size_-1)); // borders are handled here with extended values
-              uint ys = s::min(s::max(y+i, 0), static_cast<int>(size_-1));
-              window[k] =in[ {xs,ys} ];
-              k++;
-            }
-          
-          // (channel-wise) median selection using bitonic sorting
-          // the following network is used (Bose-Nelson algorithm):
-          // [[0,1],[2,3],[4,5],[7,8]]
-          // [[0,2],[1,3],[6,8]]
-          // [[1,2],[6,7],[5,8]]
-          // [[4,7],[3,8]]
-          // [[4,6],[5,7]]
-          // [[5,6],[2,7]]
-          // [[0,5],[1,6],[3,7]]
-          // [[0,4],[1,5],[3,6]]
-          // [[1,4],[2,5]]
-          // [[2,4],[3,5]]
-          // [[3,4]]
-          // se also http://pages.ripco.net/~jgamble/nw.html
-	  swap(window, 0, 1);
-          swap(window, 2, 3);
-          swap(window, 0, 2);
-          swap(window, 1, 3);
-          swap(window, 1, 2);
-          swap(window, 4, 5);
-          swap(window, 7, 8);
-          swap(window, 6, 8);
-          swap(window, 6, 7);
-          swap(window, 4, 7);
-          swap(window, 4, 6);
-          swap(window, 5, 8);
-          swap(window, 5, 7);
-          swap(window, 5, 6);
-          swap(window, 0, 5);
-          swap(window, 0, 4);
-          swap(window, 1, 6);
-          swap(window, 1, 5);
-          swap(window, 1, 4);
-          swap(window, 2, 7);
-          swap(window, 3, 8);
-          swap(window, 3, 7);
-          swap(window, 2, 5);
-          swap(window, 2, 4);
-          swap(window, 3, 6);
-          swap(window, 3, 5);
-          swap(window, 3, 4);
-
-	  out[gid] = window[4];
-       }
-       );
-     }));
-     
-     args.device_queue.wait_and_throw();
-   }
-
-
-  bool verify(VerificationSetting &ver) {  
+      sycl::range<2> ndrange{size, size};
+
+      cgh.parallel_for<class MedianFilterBenchKernel>(ndrange, [in, out, size_ = size](sycl::id<2> gid) {
+        int x = gid[0];
+        int y = gid[1];
+
+        // Optimization note: this array can be prefetched in local memory, TODO
+        sycl::float4 window[9];
+        int k = 0;
+        for(int i = -1; i < 2; i++)
+          for(int j = -1; j < 2; j++) {
+            uint xs =
+                s::min(s::max(x + j, 0), static_cast<int>(size_ - 1)); // borders are handled here with extended values
+            uint ys = s::min(s::max(y + i, 0), static_cast<int>(size_ - 1));
+            window[k] = in[{xs, ys}];
+            k++;
+          }
+
+        // (channel-wise) median selection using a sorting network
+        // the following network is used (Bose-Nelson algorithm):
+        // [[0,1],[2,3],[4,5],[7,8]]
+        // [[0,2],[1,3],[6,8]]
+        // [[1,2],[6,7],[5,8]]
+        // [[4,7],[3,8]]
+        // [[4,6],[5,7]]
+        // [[5,6],[2,7]]
+        // [[0,5],[1,6],[3,7]]
+        // [[0,4],[1,5],[3,6]]
+        // [[1,4],[2,5]]
+        // [[2,4],[3,5]]
+        // [[3,4]]
+        // see also http://pages.ripco.net/~jgamble/nw.html
+        swap(window, 0, 1);
+        swap(window, 2, 3);
+        swap(window, 0, 2);
+        swap(window, 1, 3);
+        swap(window, 1, 2);
+        swap(window, 4, 5);
+        swap(window, 7, 8);
+        swap(window, 6, 8);
+        swap(window, 6, 7);
+        swap(window, 4, 7);
+        swap(window, 4, 6);
+        swap(window, 5, 8);
+        swap(window, 5, 7);
+        swap(window, 5, 6);
+        swap(window, 0, 5);
+        swap(window, 0, 4);
+        swap(window, 1, 6);
+        swap(window, 1, 5);
+        swap(window, 1, 4);
+        swap(window, 2, 7);
+        swap(window, 3, 8);
+        swap(window, 3, 7);
+        swap(window, 2, 5);
+        swap(window, 2, 4);
+        swap(window, 3, 6);
+        swap(window, 3, 5);
+        swap(window, 3, 4);
+
+        out[gid] = window[4];
+      });
+    }));
+
+    args.device_queue.wait_and_throw();
+  }
+
+
+  bool verify(VerificationSetting& ver) {
     save_bitmap("median.bmp", size, output);
 
     bool pass = true;
-    auto output_acc = output_buf.get_access<s::access::mode::read>();
+    auto output_acc = output_buf.get_host_access();
 
-    for(size_t i=ver.begin[0]; i<ver.begin[0]+ver.range[0]; i++){
+    for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) {
       int x = i % size;
       int y = i / size;
-      cl::sycl::float4 window[9];
+      sycl::float4 window[9];
       int k = 0;
-      for(int i = -1; i<2; i++)
-        for(int j = -1; j<2; j++) {
-          uint xs = fmin(fmax(x+j, 0), size-1); // borders are handled here with extended values
-          uint ys = fmin(fmax(y+i, 0), size-1);
-          window[k] =input[xs + ys*size ];
+      for(int i = -1; i < 2; i++)
+        for(int j = -1; j < 2; j++) {
+          uint xs = fmin(fmax(x + j, 0), size - 1); // borders are handled here with extended values
+          uint ys = fmin(fmax(y + i, 0), size - 1);
+          window[k] = input[xs + ys * size];
           k++;
         }
       swap(window, 0, 1);
@@ -169,31 +164,25 @@ class MedianFilterBench
       swap(window, 3, 6);
       swap(window, 3, 5);
       swap(window, 3, 4);
-      cl::sycl::float4 expected = window[4];
-      cl::sycl::float4 dif = fdim(output_acc.get_pointer()[i], expected);
-      float length = cl::sycl::length(dif);
-      if(length > 0.01f)
-      {
+      sycl::float4 expected = window[4];
+      sycl::float4 dif = fdim(output_acc.get_pointer()[i], expected);
+      float length = sycl::length(dif);
+      if(length > 0.01f) {
         pass = false;
         break;
       }
-    }    
+    }
     return pass;
-}
+  }
 
 
-static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "MedianFilter";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "MedianFilter"; }
 
 }; // MedianFilterBench class
 
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
-  app.run<MedianFilterBench>();  
+  app.run<MedianFilterBench>();
   return 0;
 }
-
-
diff --git a/single-kernel/mol_dyn.cpp b/single-kernel/mol_dyn.cpp
index 917c2802..df1c7204 100644
--- a/single-kernel/mol_dyn.cpp
+++ b/single-kernel/mol_dyn.cpp
@@ -1,12 +1,11 @@
 #include "common.h"
 #include <iostream>
 
-//using namespace cl::sycl;
-namespace s = cl::sycl;
+// using namespace sycl;
+namespace s = sycl;
 class MolecularDynamicsKernel;
 
-class MolecularDynamicsBench
-{
+class MolecularDynamicsBench {
 protected:
   std::vector<s::float4> input;
   std::vector<s::float4> output;
@@ -23,7 +22,7 @@ class MolecularDynamicsBench
   PrefetchedBuffer<s::float4, 1> output_buf;
 
 public:
-  MolecularDynamicsBench(const BenchmarkArgs &_args) : args(_args) {}
+  MolecularDynamicsBench(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
     // host memory allocation and initialization
@@ -50,104 +49,97 @@ class MolecularDynamicsBench
     output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size * sizeof(s::float4)));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.get_access<s::access::mode::read>(cgh);
       auto neigh = neighbour_buf.get_access<s::access::mode::read>(cgh);
       auto out = output_buf.get_access<s::access::mode::discard_write>(cgh);
 
-      cl::sycl::range<1> ndrange (args.problem_size);
-
-      cgh.parallel_for<class MolecularDynamicsKernel>(ndrange,
-        [=, problem_size = args.problem_size, neighCount_ = neighCount,
-         inum_ = inum, cutsq_ = cutsq, lj1_ = lj1, lj2_ = lj2]
-        (cl::sycl::id<1> idx)
-        {
-            size_t gid= idx[0];
-
-            if (gid < problem_size) {
-                s::float4 ipos = in[gid];
-                s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f};
-                int j = 0;
-                while (j < neighCount_) {
-                    int jidx = neigh[j*inum_ + gid];
-                    s::float4 jpos = in[jidx];
-
-                    // Calculate distance
-                    float delx = ipos.x() - jpos.x();
-                    float dely = ipos.y() - jpos.y();
-                    float delz = ipos.z() - jpos.z();
-                    float r2inv = delx*delx + dely*dely + delz*delz;
-
-                    // If distance is less than cutoff, calculate force
-                    if (r2inv < cutsq_) {
-                        r2inv = 10.0f/r2inv;
-                        float r6inv = r2inv * r2inv * r2inv;
-                        float forceC = r2inv*r6inv*(lj1_*r6inv - lj2_);
-
-                        f.x() += delx * forceC;
-                        f.y() += dely * forceC;
-                        f.z() += delz * forceC;
-                    }
-                    j++;
+      sycl::range<1> ndrange(args.problem_size);
+
+      cgh.parallel_for<class MolecularDynamicsKernel>(
+          ndrange, [=, problem_size = args.problem_size, neighCount_ = neighCount, inum_ = inum, cutsq_ = cutsq,
+                       lj1_ = lj1, lj2_ = lj2](sycl::id<1> idx) {
+            size_t gid = idx[0];
+
+            if(gid < problem_size) {
+              s::float4 ipos = in[gid];
+              s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f};
+              int j = 0;
+              while(j < neighCount_) {
+                int jidx = neigh[j * inum_ + gid];
+                s::float4 jpos = in[jidx];
+
+                // Calculate distance
+                float delx = ipos.x() - jpos.x();
+                float dely = ipos.y() - jpos.y();
+                float delz = ipos.z() - jpos.z();
+                float r2inv = delx * delx + dely * dely + delz * delz;
+
+                // If distance is less than cutoff, calculate force
+                if(r2inv < cutsq_) {
+                  r2inv = 10.0f / r2inv;
+                  float r6inv = r2inv * r2inv * r2inv;
+                  float forceC = r2inv * r6inv * (lj1_ * r6inv - lj2_);
+
+                  f.x() += delx * forceC;
+                  f.y() += dely * forceC;
+                  f.z() += delz * forceC;
                 }
-                out[gid] = f;
+                j++;
+              }
+              out[gid] = f;
             }
-        });
+          });
     }));
   }
 
-  bool verify(VerificationSetting &ver) {
-    auto output_acc = output_buf.get_access<s::access::mode::read>();
+  bool verify(VerificationSetting& ver) {
+    auto output_acc = output_buf.get_host_access();
 
     bool pass = true;
     unsigned equal = 1;
     constexpr float maxErr = 10.f * std::numeric_limits<float>::epsilon();
     for(unsigned int i = 0; i < args.problem_size; ++i) {
-        s::float4 ipos = input[i];
-        s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f};
-        int j = 0;
-        while (j < neighCount) {
-            int jidx = neighbour[j*inum + i];
-            s::float4 jpos = input[jidx];
-
-            // Calculate distance
-            float delx = ipos.x() - jpos.x();
-            float dely = ipos.y() - jpos.y();
-            float delz = ipos.z() - jpos.z();
-            float r2inv = delx*delx + dely*dely + delz*delz;
-
-            // If distance is less than cutoff, calculate force
-            if (r2inv < cutsq) {
-                r2inv = 10.0f/r2inv;
-                float r6inv = r2inv * r2inv * r2inv;
-                float forceC = r2inv*r6inv*(lj1*r6inv - lj2);
-
-                f.x() += delx * forceC;
-                f.y() += dely * forceC;
-                f.z() += delz * forceC;
-            }
-            j++;
+      s::float4 ipos = input[i];
+      s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f};
+      int j = 0;
+      while(j < neighCount) {
+        int jidx = neighbour[j * inum + i];
+        s::float4 jpos = input[jidx];
+
+        // Calculate distance
+        float delx = ipos.x() - jpos.x();
+        float dely = ipos.y() - jpos.y();
+        float delz = ipos.z() - jpos.z();
+        float r2inv = delx * delx + dely * dely + delz * delz;
+
+        // If distance is less than cutoff, calculate force
+        if(r2inv < cutsq) {
+          r2inv = 10.0f / r2inv;
+          float r6inv = r2inv * r2inv * r2inv;
+          float forceC = r2inv * r6inv * (lj1 * r6inv - lj2);
+
+          f.x() += delx * forceC;
+          f.y() += dely * forceC;
+          f.z() += delz * forceC;
         }
+        j++;
+      }
 
-        if(s::distance(f, output_acc[i]) / s::length(f) > maxErr) {
-          pass = false;
-          break;
-        }
+      if(s::distance(f, output_acc[i]) / s::length(f) > maxErr) {
+        pass = false;
+        break;
+      }
     }
     return pass;
   }
-  
-  static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "MolecularDynamics";
-  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "MolecularDynamics"; }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
-  app.run<MolecularDynamicsBench>();  
+  app.run<MolecularDynamicsBench>();
   return 0;
 }
diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp
index a080832a..0af9ef35 100644
--- a/single-kernel/nbody.cpp
+++ b/single-kernel/nbody.cpp
@@ -1,18 +1,19 @@
 #include "common.h"
 
-#include <iostream>
 #include <cassert>
+#include <iostream>
 #include <limits>
 
-using namespace cl;
+using namespace sycl;
 
-template<class float_type> class NDRangeNBodyKernel;
-template<class float_type> class HierarchicalNBodyKernel;
+template <class float_type>
+class NDRangeNBodyKernel;
+template <class float_type>
+class HierarchicalNBodyKernel;
 
 
-template<class float_type>
-class NBody
-{
+template <class float_type>
+class NBody {
 protected:
   using particle_type = sycl::vec<float_type, 4>;
   using vector_type = sycl::vec<float_type, 3>;
@@ -31,19 +32,17 @@ class NBody
 
   PrefetchedBuffer<particle_type> particles_buf;
   PrefetchedBuffer<vector_type> velocities_buf;
+
 public:
-  NBody(const BenchmarkArgs& _args)
-      : args(_args), gravitational_softening{1.e-5f}, dt{1.e-2f} {
+  NBody(const BenchmarkArgs& _args) : args(_args), gravitational_softening{1.e-5f}, dt{1.e-2f} {
     assert(args.problem_size % args.local_size == 0);
   }
 
   void setup() {
-    
     particles.resize(args.problem_size);
     velocities.resize(args.problem_size);
 
     for(std::size_t i = 0; i < args.problem_size; ++i) {
-
       float_type rel_i = static_cast<float_type>(i) / static_cast<float_type>(args.problem_size);
 
       particles[i].x() = rel_i * std::cos(3000.f * 2.f * M_PI * rel_i);
@@ -56,21 +55,21 @@ class NBody
       velocities[i].z() = 0;
     }
 
-    particles_buf. initialize(args.device_queue, this->particles.data(), sycl::range<1>{this->args.problem_size});
+    particles_buf.initialize(args.device_queue, this->particles.data(), sycl::range<1>{this->args.problem_size});
     velocities_buf.initialize(args.device_queue, this->velocities.data(), sycl::range<1>{this->args.problem_size});
 
-    output_particles. initialize(args.device_queue, sycl::range<1>{args.problem_size});
+    output_particles.initialize(args.device_queue, sycl::range<1>{args.problem_size});
     output_velocities.initialize(args.device_queue, sycl::range<1>{args.problem_size});
   }
 
 
-  bool verify(VerificationSetting &ver) {
-    auto resulting_particles = output_particles.template get_access<sycl::access::mode::read>();
-    auto resulting_velocities = output_velocities.template get_access<sycl::access::mode::read>();
+  bool verify(VerificationSetting& ver) {
+    auto resulting_particles = output_particles.get_host_access();
+    auto resulting_velocities = output_velocities.get_host_access();
 
     std::vector<particle_type> host_resulting_particles(particles.size());
     std::vector<vector_type> host_resulting_velocities(particles.size());
-  
+
 
     for(std::size_t i = 0; i < particles.size(); ++i) {
       const particle_type my_p = particles[i];
@@ -78,27 +77,20 @@ class NBody
       vector_type acceleration{static_cast<float_type>(0.0f)};
 
       for(std::size_t j = 0; j < particles.size(); ++j) {
-
         if(i != j) {
           const particle_type p = particles[j];
-          
-          const vector_type R {
-            p.x() - my_p.x(), 
-            p.y() - my_p.y(),
-            p.z() - my_p.z()
-          };
 
-          const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + 
-                                              gravitational_softening);
+          const vector_type R{p.x() - my_p.x(), p.y() - my_p.y(), p.z() - my_p.z()};
+
+          const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening);
 
           acceleration += static_cast<float_type>(p.w()) * r_inv * r_inv * r_inv * R;
         }
-        
       }
 
       vector_type new_v = my_v + acceleration * dt;
       particle_type new_p = my_p;
-      new_p.x() += new_v.x() * dt; 
+      new_p.x() += new_v.x() * dt;
       new_p.y() += new_v.y() * dt;
       new_p.z() += new_v.z() * dt;
 
@@ -131,12 +123,10 @@ class NBody
       auto output_particles_access = output_particles.template get_access<sycl::access::mode::discard_write>(cgh);
       auto output_velocities_access = output_velocities.template get_access<sycl::access::mode::discard_write>(cgh);
 
-      auto scratch = sycl::accessor<particle_type, 1, sycl::access::mode::read_write, sycl::access::target::local>{
-          sycl::range<1>{args.local_size}, cgh};
+      auto scratch = sycl::local_accessor<particle_type, 1>{sycl::range<1>{args.local_size}, cgh};
 
       cgh.parallel_for<NDRangeNBodyKernel<float_type>>(execution_range,
           [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::nd_item<1> tid) {
-
             const size_t global_id = tid.get_global_id(0);
             const size_t local_id = tid.get_local_id(0);
             const size_t num_particles = tid.get_global_range()[0];
@@ -153,7 +143,7 @@ class NBody
               scratch[local_id] = (global_id < num_particles) ? particles_access[offset + local_id]
                                                               : particle_type{static_cast<float_type>(0.0f)};
 
-              tid.barrier();
+              sycl::group_barrier(tid.get_group());
 
               for(int i = 0; i < local_size; ++i) {
                 const particle_type p = scratch[i];
@@ -168,7 +158,7 @@ class NBody
                   acceleration += static_cast<float_type>(p.w()) * r_inv * r_inv * r_inv * R;
               }
 
-              tid.barrier();
+              sycl::group_barrier(tid.get_group());
             }
 
             // This is a dirt cheap Euler integration, but could be
@@ -199,8 +189,7 @@ class NBody
       auto output_particles_access = output_particles.template get_access<sycl::access::mode::discard_write>(cgh);
       auto output_velocities_access = output_velocities.template get_access<sycl::access::mode::discard_write>(cgh);
 
-      auto scratch = sycl::accessor<particle_type, 1, sycl::access::mode::read_write, sycl::access::target::local>{
-          sycl::range<1>{args.local_size}, cgh};
+      auto scratch = sycl::local_accessor<particle_type, 1>{sycl::range<1>{args.local_size}, cgh};
 
 
       const size_t local_size = args.local_size;
@@ -267,20 +256,16 @@ class NBody
   }
 };
 
-template<class float_type>
-class NBodyNDRange : public NBody<float_type>
-{
+template <class float_type>
+class NBodyNDRange : public NBody<float_type> {
 public:
   using typename NBody<float_type>::particle_type;
   using typename NBody<float_type>::vector_type;
 
-  NBodyNDRange(const BenchmarkArgs& _args)
-  : NBody<float_type>{_args} {}
+  NBodyNDRange(const BenchmarkArgs& _args) : NBody<float_type>{_args} {}
 
 
-  void run(){
-    this->submitNDRange(this->particles_buf.get(), this->velocities_buf.get());
-  }
+  void run() { this->submitNDRange(this->particles_buf.get(), this->velocities_buf.get()); }
 
   std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
@@ -291,43 +276,38 @@ class NBodyNDRange : public NBody<float_type>
 };
 
 
-template<class float_type>
-class NBodyHierarchical : public NBody<float_type>
-{
+template <class float_type>
+class NBodyHierarchical : public NBody<float_type> {
 public:
   using typename NBody<float_type>::particle_type;
   using typename NBody<float_type>::vector_type;
 
-  NBodyHierarchical(const BenchmarkArgs& _args)
-  : NBody<float_type>{_args} {}
+  NBodyHierarchical(const BenchmarkArgs& _args) : NBody<float_type>{_args} {}
 
 
-  void run(){
-    this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get());
-  }
+  void run() { this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get()); }
 
   std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "NBody_Hierarchical_";
     name << ReadableTypename<float_type>::name;
-    
+
     return name.str();
   }
 };
 
-int main(int argc, char** argv)
-{
-
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
 
-  app.run< NBodyHierarchical<float> >();
-  if(app.deviceSupportsFP64())
+  app.run<NBodyHierarchical<float>>();
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<NBodyHierarchical<double>>();
-
+  }
   if(app.shouldRunNDRangeKernels()) {
-    app.run< NBodyNDRange<float> >();
-    if(app.deviceSupportsFP64())
+    app.run<NBodyNDRange<float>>();
+    if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
       app.run<NBodyNDRange<double>>();
+    }
   }
 
   return 0;
diff --git a/single-kernel/perlin.cpp b/single-kernel/perlin.cpp
index 633e2137..f97626fb 100644
--- a/single-kernel/perlin.cpp
+++ b/single-kernel/perlin.cpp
@@ -1 +1 @@
-TODO: import Perlin noise from the Insieme OpenCL benchmark
+TODO : import Perlin noise from the Insieme OpenCL benchmark
diff --git a/single-kernel/scalar_prod.cpp b/single-kernel/scalar_prod.cpp
index 7583924b..da146f55 100644
--- a/single-kernel/scalar_prod.cpp
+++ b/single-kernel/scalar_prod.cpp
@@ -1,47 +1,46 @@
 #include "common.h"
 
+#include <iomanip>
 #include <iostream>
 #include <type_traits>
-#include <iomanip>
 
-//using namespace cl::sycl;
-namespace s = cl::sycl;
+// using namespace sycl;
+namespace s = sycl;
 
-template<typename T, bool>
+template <typename T, bool>
 class ScalarProdKernel;
-template<typename T, bool>
+template <typename T, bool>
 class ScalarProdKernelHierarchical;
 
-template<typename T, bool>
+template <typename T, bool>
 class ScalarProdReduction;
-template<typename T, bool>
+template <typename T, bool>
 class ScalarProdReductionHierarchical;
-template<typename T, bool>
+template <typename T, bool>
 class ScalarProdGatherKernel;
 
-template<typename T, bool Use_ndrange = true>
-class ScalarProdBench
-{
-protected:    
-    std::vector<T> input1;
-    std::vector<T> input2;
-    std::vector<T> output;
-    BenchmarkArgs args;
+template <typename T, bool Use_ndrange = true>
+class ScalarProdBench {
+protected:
+  std::vector<T> input1;
+  std::vector<T> input2;
+  std::vector<T> output;
+  BenchmarkArgs args;
 
-    PrefetchedBuffer<T, 1> input1_buf;
-    PrefetchedBuffer<T, 1> input2_buf;
-    PrefetchedBuffer<T, 1> output_buf;
+  PrefetchedBuffer<T, 1> input1_buf;
+  PrefetchedBuffer<T, 1> input2_buf;
+  PrefetchedBuffer<T, 1> output_buf;
 
 public:
-  ScalarProdBench(const BenchmarkArgs &_args) : args(_args) {}
-  
-  void setup() {      
+  ScalarProdBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
     // host memory allocation and initialization
     input1.resize(args.problem_size);
     input2.resize(args.problem_size);
     output.resize(args.problem_size);
 
-    for (size_t i = 0; i < args.problem_size; i++) {
+    for(size_t i = 0; i < args.problem_size; i++) {
       input1[i] = static_cast<T>(1);
       input2[i] = static_cast<T>(2);
       output[i] = static_cast<T>(0);
@@ -52,35 +51,29 @@ class ScalarProdBench
     output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in1 = input1_buf.template get_access<s::access::mode::read>(cgh);
       auto in2 = input2_buf.template get_access<s::access::mode::read>(cgh);
       // Use discard_write here, otherwise the content of the hostbuffer must first be copied to device
       auto intermediate_product = output_buf.template get_access<s::access::mode::discard_write>(cgh);
 
-      if(Use_ndrange){
-        cl::sycl::nd_range<1> ndrange (args.problem_size, args.local_size);
+      if(Use_ndrange) {
+        sycl::nd_range<1> ndrange(args.problem_size, args.local_size);
 
-        cgh.parallel_for<class ScalarProdKernel<T, Use_ndrange>>(ndrange,
-          [=](cl::sycl::nd_item<1> item) 
-          {
-            size_t gid= item.get_global_linear_id();
-            intermediate_product[gid] = in1[gid] * in2[gid];
-          });
-      }
-      else {
+        cgh.parallel_for<class ScalarProdKernel<T, Use_ndrange>>(ndrange, [=](sycl::nd_item<1> item) {
+          size_t gid = item.get_global_linear_id();
+          intermediate_product[gid] = in1[gid] * in2[gid];
+        });
+      } else {
         cgh.parallel_for_work_group<class ScalarProdKernelHierarchical<T, Use_ndrange>>(
-          cl::sycl::range<1>{args.problem_size / args.local_size},
-          cl::sycl::range<1>{args.local_size},
-          [=](cl::sycl::group<1> grp){
-            grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){
-              size_t gid = idx.get_global_id(0);
-              intermediate_product[gid] = in1[gid] * in2[gid];
+            sycl::range<1>{args.problem_size / args.local_size}, sycl::range<1>{args.local_size},
+            [=](sycl::group<1> grp) {
+              grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
+                size_t gid = idx.get_global_id(0);
+                intermediate_product[gid] = in1[gid] * in2[gid];
+              });
             });
-          });
       }
     }));
 
@@ -91,119 +84,109 @@ class ScalarProdBench
     // Not yet tested with more than 2
     auto elements_per_thread = 2;
 
-    while (array_size!= 1) {
-      auto n_wgroups = (array_size + wgroup_size*elements_per_thread - 1)/(wgroup_size*elements_per_thread); // two threads per work item
-
-      events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
-
-          auto global_mem = output_buf.template get_access<s::access::mode::read_write>(cgh);
-      
-          // local memory for reduction
-          auto local_mem = s::accessor <T, 1, s::access::mode::read_write, s::access::target::local> {s::range<1>(wgroup_size), cgh};
-          cl::sycl::nd_range<1> ndrange (n_wgroups*wgroup_size, wgroup_size);
-    
-          if(Use_ndrange) {
-            cgh.parallel_for<class ScalarProdReduction<T, Use_ndrange>>(ndrange,
-            [=](cl::sycl::nd_item<1> item) 
-              {
-                size_t gid= item.get_global_linear_id();
-                size_t lid = item.get_local_linear_id();
-
-                // initialize local memory to 0
-                local_mem[lid] = 0; 
-
-                for(int i = 0; i < elements_per_thread; ++i) {
-                  int input_element = gid + i * n_wgroups * wgroup_size;
-                  
-                  if(input_element < array_size)
-                    local_mem[lid] += global_mem[input_element];
-                }
+    while(array_size != 1) {
+      auto n_wgroups = (array_size + wgroup_size * elements_per_thread - 1) /
+                       (wgroup_size * elements_per_thread); // each work-item sums elements_per_thread inputs
 
-                item.barrier(s::access::fence_space::local_space);
+      events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+        auto global_mem = output_buf.template get_access<s::access::mode::read_write>(cgh);
 
-                for(size_t stride = wgroup_size/elements_per_thread; stride >= 1; stride /= elements_per_thread) {
-                  if(lid < stride) {
-                    for(int i = 0; i < elements_per_thread-1; ++i){
-                      local_mem[lid] += local_mem[lid + stride + i];
-                    }
-                  }
-                  item.barrier(s::access::fence_space::local_space);
-                }
-                
-                // Only one work-item per work group writes to global memory 
-                if (lid == 0) {
-                  global_mem[item.get_global_id()] = local_mem[0];
+        // local memory for reduction
+        auto local_mem = s::local_accessor<T, 1>{s::range<1>(wgroup_size), cgh};
+
+        sycl::nd_range<1> ndrange(n_wgroups * wgroup_size, wgroup_size);
+
+        if(Use_ndrange) {
+          cgh.parallel_for<class ScalarProdReduction<T, Use_ndrange>>(ndrange, [=](sycl::nd_item<1> item) {
+            size_t gid = item.get_global_linear_id();
+            size_t lid = item.get_local_linear_id();
+
+            // initialize local memory to 0
+            local_mem[lid] = 0;
+
+            for(int i = 0; i < elements_per_thread; ++i) {
+              int input_element = gid + i * n_wgroups * wgroup_size;
+
+              if(input_element < array_size)
+                local_mem[lid] += global_mem[input_element];
+            }
+
+            sycl::group_barrier(item.get_group());
+
+            for(size_t stride = wgroup_size / elements_per_thread; stride >= 1; stride /= elements_per_thread) {
+              if(lid < stride) {
+                for(int i = 0; i < elements_per_thread - 1; ++i) {
+                  local_mem[lid] += local_mem[lid + stride + i];
                 }
-              });
-          }
-          else {
-            cgh.parallel_for_work_group<class ScalarProdReductionHierarchical<T, Use_ndrange>>(
-              cl::sycl::range<1>{n_wgroups}, cl::sycl::range<1>{wgroup_size},
-              [=](cl::sycl::group<1> grp){
-                
-                grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){
+              }
+              sycl::group_barrier(item.get_group());
+            }
+
+            // Only one work-item per work group writes to global memory
+            if(lid == 0) {
+              global_mem[item.get_global_id()] = local_mem[0];
+            }
+          });
+        } else {
+          cgh.parallel_for_work_group<class ScalarProdReductionHierarchical<T, Use_ndrange>>(
+              sycl::range<1>{n_wgroups}, sycl::range<1>{wgroup_size}, [=](sycl::group<1> grp) {
+                grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
                   const size_t gid = idx.get_global_id(0);
                   const size_t lid = idx.get_local_id(0);
 
                   // initialize local memory to 0
-                  local_mem[lid] = 0; 
+                  local_mem[lid] = 0;
 
                   for(int i = 0; i < elements_per_thread; ++i) {
                     int input_element = gid + i * n_wgroups * wgroup_size;
-                  
+
                     if(input_element < array_size)
                       local_mem[lid] += global_mem[input_element];
                   }
                 });
 
-                for(size_t stride = wgroup_size/elements_per_thread; stride >= 1; stride /= elements_per_thread) {
-                  grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){
-                  
+                for(size_t stride = wgroup_size / elements_per_thread; stride >= 1; stride /= elements_per_thread) {
+                  grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
                     const size_t lid = idx.get_local_id(0);
-                    
+
                     if(lid < stride) {
-                      for(int i = 0; i < elements_per_thread-1; ++i){
+                      for(int i = 0; i < elements_per_thread - 1; ++i) {
                         local_mem[lid] += local_mem[lid + stride + i];
                       }
                     }
                   });
                 }
-                grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){
+                grp.parallel_for_work_item([&](sycl::h_item<1> idx) {
                   const size_t lid = idx.get_local_id(0);
                   if(lid == 0)
-                    global_mem[grp.get_id(0) * grp.get_local_range(0)] = local_mem[0];
+                    global_mem[grp.get_group_id(0) * grp.get_local_range(0)] = local_mem[0];
                 });
               });
-          }
-        }));
-      
-      events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
-
-          auto global_mem = output_buf.template get_access<s::access::mode::read_write>(cgh);
-      
-          cgh.parallel_for<ScalarProdGatherKernel<T, Use_ndrange>>(cl::sycl::range<1>{n_wgroups},
-                                                   [=](cl::sycl::id<1> idx){
-            global_mem[idx] = global_mem[idx * wgroup_size];
-          });
-        }));
+        }
+      }));
+
+      events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+        auto global_mem = output_buf.template get_access<s::access::mode::read_write>(cgh);
+
+        cgh.parallel_for<ScalarProdGatherKernel<T, Use_ndrange>>(
+            sycl::range<1>{n_wgroups}, [=](sycl::id<1> idx) { global_mem[idx] = global_mem[idx * wgroup_size]; });
+      }));
       array_size = n_wgroups;
     }
   }
 
-  bool verify(VerificationSetting &ver) { 
+  bool verify(VerificationSetting& ver) {
     bool pass = true;
-    auto expected = static_cast <T>(0);
+    auto expected = static_cast<T>(0);
 
-    auto output_acc = output_buf.template get_access<s::access::mode::read>();
+    auto output_acc = output_buf.get_host_access();
 
     for(size_t i = 0; i < args.problem_size; i++) {
-        expected += input1[i] * input2[i];
+      expected += input1[i] * input2[i];
     }
 
-    //std::cout << "Scalar product on CPU =" << expected << std::endl;
-    //std::cout << "Scalar product on Device =" << output[0] << std::endl;
+    // std::cout << "Scalar product on CPU =" << expected << std::endl;
+    // std::cout << "Scalar product on Device =" << output[0] << std::endl;
 
     // Todo: update to type-specific test (Template specialization?)
     const auto tolerance = 0.00001f;
@@ -213,7 +196,7 @@ class ScalarProdBench
 
     return pass;
   }
-  
+
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "ScalarProduct_";
@@ -223,22 +206,22 @@ class ScalarProdBench
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
   if(app.shouldRunNDRangeKernels()) {
     app.run<ScalarProdBench<int, true>>();
     app.run<ScalarProdBench<long long, true>>();
     app.run<ScalarProdBench<float, true>>();
-    if(app.deviceSupportsFP64())
+    if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
       app.run<ScalarProdBench<double, true>>();
+    }
   }
 
   app.run<ScalarProdBench<int, false>>();
   app.run<ScalarProdBench<long long, false>>();
   app.run<ScalarProdBench<float, false>>();
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<ScalarProdBench<double, false>>();
-
+  }
   return 0;
 }
diff --git a/single-kernel/sobel.cpp b/single-kernel/sobel.cpp
index 9068668e..f906befc 100644
--- a/single-kernel/sobel.cpp
+++ b/single-kernel/sobel.cpp
@@ -1,57 +1,57 @@
-#include <CL/sycl.hpp>
 #include <iostream>
+#include <sycl/sycl.hpp>
 
-#include "common.h"
 #include "bitmap.h"
+#include "common.h"
 
 
-namespace s = cl::sycl;
+namespace s = sycl;
 class SobelBenchKernel; // kernel forward declaration
 
 /*
   A Sobel filter with a convolution matrix 3x3.
-  Input and output are two-dimensional buffers of floats.     
+  Input and output are two-dimensional buffers of floats.
  */
-class SobelBench
-{
+class SobelBench {
 protected:
-  std::vector<cl::sycl::float4> input;
-  std::vector<cl::sycl::float4> output;
+  std::vector<sycl::float4> input;
+  std::vector<sycl::float4> output;
 
   size_t w, h; // size of the input picture
   size_t size; // user-defined size (input and output will be size x size)
   BenchmarkArgs args;
 
 
-  PrefetchedBuffer<cl::sycl::float4, 2> input_buf;    
-  PrefetchedBuffer<cl::sycl::float4, 2> output_buf;
+  PrefetchedBuffer<sycl::float4, 2> input_buf;
+  PrefetchedBuffer<sycl::float4, 2> output_buf;
+
 public:
-  SobelBench(const BenchmarkArgs &_args) : args(_args) {}
+  SobelBench(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
     size = args.problem_size; // input size defined by the user
-    input.resize(size * size); 
+    input.resize(size * size);
     load_bitmap_mirrored("../../share/Brommy.bmp", size, input);
     output.resize(size * size);
 
-    input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size));    
+    input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size));
     output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.get_access<s::access::mode::read>(cgh);
       auto out = output_buf.get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<2> ndrange{size, size};
+      sycl::range<2> ndrange{size, size};
 
       // Sobel kernel 3x3
       const float kernel[] = {1, 0, -1, 2, 0, -2, 1, 0, -1};
 
-      cgh.parallel_for<class SobelBenchKernel>(ndrange, [in, out, kernel, size_ = size](cl::sycl::id<2> gid) {
+      cgh.parallel_for<class SobelBenchKernel>(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) {
         int x = gid[0];
         int y = gid[1];
-        cl::sycl::float4 Gx = cl::sycl::float4(0, 0, 0, 0);
-        cl::sycl::float4 Gy = cl::sycl::float4(0, 0, 0, 0);
+        sycl::float4 Gx = sycl::float4(0, 0, 0, 0);
+        sycl::float4 Gy = sycl::float4(0, 0, 0, 0);
         const int radius = 3;
 
         // constant-size loops in [0,1,2]
@@ -68,29 +68,29 @@ class SobelBench
               continue;
 
             // sample color
-            cl::sycl::float4 sample = in[{xs, ys}];
+            sycl::float4 sample = in[{xs, ys}];
 
             // convolution calculation
             int offset_x = x_shift + y_shift * radius;
             int offset_y = y_shift + x_shift * radius;
 
             float conv_x = kernel[offset_x];
-            cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
+            sycl::float4 conv4_x = sycl::float4(conv_x);
             Gx += conv4_x * sample;
 
             float conv_y = kernel[offset_y];
-            cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
+            sycl::float4 conv4_y = sycl::float4(conv_y);
             Gy += conv4_y * sample;
           }
         }
         // taking root of sums of squares of Gx and Gy
-        cl::sycl::float4 color = hypot(Gx, Gy);
-        cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-        cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
+        sycl::float4 color = hypot(Gx, Gy);
+        sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+        sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
         out[gid] = clamp(color, minval, maxval);
       });
     }));
-   }
+  }
 
 
   bool verify(VerificationSetting& ver) {
@@ -104,7 +104,7 @@ class SobelBench
     for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) {
       int x = i % size;
       int y = i / size;
-      cl::sycl::float4 Gx, Gy;
+      sycl::float4 Gx(0, 0, 0, 0), Gy(0, 0, 0, 0); // zero the accumulators explicitly, as the device kernel does
       for(uint x_shift = 0; x_shift < 3; x_shift++)
         for(uint y_shift = 0; y_shift < 3; y_shift++) {
           uint xs = x + x_shift - 1;
@@ -113,23 +113,23 @@ class SobelBench
             continue;
           if(xs < 0 || xs >= size || ys < 0 || ys >= size)
             continue;
-          cl::sycl::float4 sample = input[xs + ys * size];
+          sycl::float4 sample = input[xs + ys * size];
           int offset_x = x_shift + y_shift * radius;
           int offset_y = y_shift + x_shift * radius;
           float conv_x = kernel[offset_x];
-          cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
+          sycl::float4 conv4_x = sycl::float4(conv_x);
           Gx += conv4_x * sample;
           float conv_y = kernel[offset_y];
-          cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
+          sycl::float4 conv4_y = sycl::float4(conv_y);
           Gy += conv4_y * sample;
         }
 
-      cl::sycl::float4 color = hypot(Gx, Gy);
-      cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-      cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
-      cl::sycl::float4 expected = clamp(color, minval, maxval);
-      cl::sycl::float4 dif = fdim(output[i], expected);
-      float length = cl::sycl::length(dif);
+      sycl::float4 color = hypot(Gx, Gy); // per-channel sqrt(Gx^2 + Gy^2), same as the device kernel
+      sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+      sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
+      sycl::float4 expected = clamp(color, minval, maxval); // host reference value for pixel i
+      sycl::float4 dif = output[i] - expected; // signed error; fdim() would clamp undershoot to 0 and hide it
+      float length = sycl::length(dif); // Euclidean norm of the per-channel error (sign-independent)
       if(length > 0.01f) {
         pass = false;
         break;
@@ -144,11 +144,8 @@ class SobelBench
 }; // SobelBench class
 
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
-  app.run<SobelBench>();  
+  app.run<SobelBench>();
   return 0;
 }
-
-
diff --git a/single-kernel/sobel5.cpp b/single-kernel/sobel5.cpp
index 92ede750..47316328 100644
--- a/single-kernel/sobel5.cpp
+++ b/single-kernel/sobel5.cpp
@@ -1,11 +1,11 @@
-#include <CL/sycl.hpp>
 #include <iostream>
+#include <sycl/sycl.hpp>
 
-#include "common.h"
 #include "bitmap.h"
+#include "common.h"
 
 
-namespace s = cl::sycl;
+namespace s = sycl;
 class Sobel5BenchKernel; // kernel forward declaration
 
 
@@ -13,22 +13,22 @@ class Sobel5BenchKernel; // kernel forward declaration
   A Sobel filter with a convolution matrix 5x5.
   The convolution kernel is calculated by using a recursive conv2 on the
   [1 2 1]'*[1 0 -1] basis matrix.
-  Input and output are two-dimensional buffers of floats.     
+  Input and output are two-dimensional buffers of floats.
  */
-class Sobel5Bench
-{
+class Sobel5Bench {
 protected:
-    std::vector<cl::sycl::float4> input;
-    std::vector<cl::sycl::float4> output;
+  std::vector<sycl::float4> input;
+  std::vector<sycl::float4> output;
+
+  size_t w, h; // size of the input picture
+  size_t size; // user-defined size (input and output will be size x size)
+  BenchmarkArgs args;
 
-    size_t w, h; // size of the input picture
-    size_t size; // user-defined size (input and output will be size x size)
-    BenchmarkArgs args;
+  PrefetchedBuffer<sycl::float4, 2> input_buf;
+  PrefetchedBuffer<sycl::float4, 2> output_buf;
 
-    PrefetchedBuffer<cl::sycl::float4, 2> input_buf;
-    PrefetchedBuffer<cl::sycl::float4, 2> output_buf;
 public:
-  Sobel5Bench(const BenchmarkArgs &_args) : args(_args) {}
+  Sobel5Bench(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
     size = args.problem_size; // input size defined by the user
@@ -40,132 +40,113 @@ class Sobel5Bench
     output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
-      auto in  = input_buf .get_access<s::access::mode::read>(cgh);
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+      auto in = input_buf.get_access<s::access::mode::read>(cgh);
       auto out = output_buf.get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<2> ndrange {size, size};
+      sycl::range<2> ndrange{size, size};
 
       // Sobel kernel 5x5
-      const float kernel[] =
-      { 1,  2, 0,  -2, -1,
-        4,  8, 0,  -8, -4,
-	      6, 12, 0, -12, -6,
-	      4,  8, 0,  -8, -4,
-	      1,  2, 0,  -2, -1
-      };
-
-      cgh.parallel_for<Sobel5BenchKernel>(ndrange,
-        [in, out, kernel, size_ = size](cl::sycl::id<2> gid)
-        {
-          int x = gid[0];
-          int y = gid[1];
-          cl::sycl::float4 Gx = cl::sycl::float4(0,0,0,0);
-          cl::sycl::float4 Gy = cl::sycl::float4(0,0,0,0);
-          const int radius = 5;
-
-          // constant-size loops in [0,1,2,3,4]
-          for(int x_shift = 0; x_shift<5; x_shift++)
-          {
-            for(int y_shift = 0; y_shift<5; y_shift++)
-            {
-              // sample position
-              uint xs = x + x_shift - 2; // [x-2,x-1,x,x+1,x+2]
-              uint ys = y + y_shift - 2; // [y-2,y-1,y,y+1,y+2]
-              // for the same pixel, convolution is always 0  
-              if(x==xs && y==ys) continue;
-              // boundary check
-              if(xs < 0 || xs >= size_ || ys < 0 || ys >= size_) continue;
-
-              // sample color
-              cl::sycl::float4 sample = in[ {xs,ys} ];
-
-              // convolution calculation
-              int offset_x = x_shift + y_shift * radius;
-              int offset_y = y_shift + x_shift * radius;
-
-              float conv_x   = kernel[offset_x];
-              cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
-              Gx += conv4_x * sample;
-
-              float conv_y   = kernel[offset_y];
-              cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
-              Gy += conv4_y * sample;
-            }
+      const float kernel[] = {1, 2, 0, -2, -1, 4, 8, 0, -8, -4, 6, 12, 0, -12, -6, 4, 8, 0, -8, -4, 1, 2, 0, -2, -1};
+
+      cgh.parallel_for<Sobel5BenchKernel>(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) {
+        int x = gid[0];
+        int y = gid[1];
+        sycl::float4 Gx = sycl::float4(0, 0, 0, 0);
+        sycl::float4 Gy = sycl::float4(0, 0, 0, 0);
+        const int radius = 5;
+
+        // constant-size loops in [0,1,2,3,4]
+        for(int x_shift = 0; x_shift < 5; x_shift++) {
+          for(int y_shift = 0; y_shift < 5; y_shift++) {
+            // sample position
+            uint xs = x + x_shift - 2; // [x-2,x-1,x,x+1,x+2]
+            uint ys = y + y_shift - 2; // [y-2,y-1,y,y+1,y+2]
+            // for the same pixel, convolution is always 0
+            if(x == xs && y == ys)
+              continue;
+            // boundary check
+            if(xs < 0 || xs >= size_ || ys < 0 || ys >= size_)
+              continue;
+
+            // sample color
+            sycl::float4 sample = in[{xs, ys}];
+
+            // convolution calculation
+            int offset_x = x_shift + y_shift * radius;
+            int offset_y = y_shift + x_shift * radius;
+
+            float conv_x = kernel[offset_x];
+            sycl::float4 conv4_x = sycl::float4(conv_x);
+            Gx += conv4_x * sample;
+
+            float conv_y = kernel[offset_y];
+            sycl::float4 conv4_y = sycl::float4(conv_y);
+            Gy += conv4_y * sample;
           }
-          // taking root of sums of squares of Gx and Gy        
-          cl::sycl::float4 color = hypot(Gx, Gy);
-          cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-          cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
-          out[gid] = clamp(color, minval, maxval);
-      }
-       );
-     }));
-   }
-      
-  bool verify(VerificationSetting &ver) {
+        }
+        // taking root of sums of squares of Gx and Gy
+        sycl::float4 color = hypot(Gx, Gy);
+        sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+        sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
+        out[gid] = clamp(color, minval, maxval);
+      });
+    }));
+  }
+
+  bool verify(VerificationSetting& ver) {
     // Triggers writeback
     output_buf.reset();
     save_bitmap("sobel5.bmp", size, output);
 
-    const float kernel[] = { 1, 2, 0,  -2, -1,4,  8, 0,  -8, -4, 6, 12, 0, -12, -6, 4,  8, 0,  -8, -4, 1,  2, 0,  -2, -1 };
+    const float kernel[] = {1, 2, 0, -2, -1, 4, 8, 0, -8, -4, 6, 12, 0, -12, -6, 4, 8, 0, -8, -4, 1, 2, 0, -2, -1};
 
     bool pass = true;
     int radius = 5;
-    for(size_t i=ver.begin[0]; i<ver.begin[0]+ver.range[0]; i++){
+    for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) {
       int x = i % size;
       int y = i / size;
-      cl::sycl::float4 Gx, Gy;
-        for(uint x_shift = 0; x_shift<5; x_shift++)
-             for(uint y_shift = 0; y_shift<5; y_shift++) {
-                  uint xs = x + x_shift - 2;
-                  uint ys = y + y_shift - 2;
-                  if(x==xs && y==ys)  continue;
-                  if(xs < 0 || xs >= size || ys < 0 || ys >= size) continue;
-                  cl::sycl::float4 sample = input[xs + ys * size];
-                  int offset_x  = x_shift + y_shift * radius;
-                  int offset_y  = y_shift + x_shift * radius;
-                  float conv_x   = kernel[offset_x];
-                  cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
-                  Gx += conv4_x * sample;
-                  float conv_y   = kernel[offset_y];
-                  cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
-                  Gy += conv4_y * sample;
-               }
-        cl::sycl::float4 color = hypot(Gx, Gy);
-        cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-        cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
-        cl::sycl::float4 expected = clamp(color, minval, maxval);
-        cl::sycl::float4 dif = fdim(output[i], expected);
-        float length = cl::sycl::length(dif);
-        if(length > 0.01f)
-        {
-            pass = false;
-            break;
+      sycl::float4 Gx(0, 0, 0, 0), Gy(0, 0, 0, 0); // zero the accumulators explicitly, as the device kernel does
+      for(uint x_shift = 0; x_shift < 5; x_shift++)
+        for(uint y_shift = 0; y_shift < 5; y_shift++) {
+          uint xs = x + x_shift - 2;
+          uint ys = y + y_shift - 2;
+          if(x == xs && y == ys)
+            continue;
+          if(xs < 0 || xs >= size || ys < 0 || ys >= size)
+            continue;
+          sycl::float4 sample = input[xs + ys * size];
+          int offset_x = x_shift + y_shift * radius;
+          int offset_y = y_shift + x_shift * radius;
+          float conv_x = kernel[offset_x];
+          sycl::float4 conv4_x = sycl::float4(conv_x);
+          Gx += conv4_x * sample;
+          float conv_y = kernel[offset_y];
+          sycl::float4 conv4_y = sycl::float4(conv_y);
+          Gy += conv4_y * sample;
         }
+      sycl::float4 color = hypot(Gx, Gy); // per-channel sqrt(Gx^2 + Gy^2), same as the device kernel
+      sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+      sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
+      sycl::float4 expected = clamp(color, minval, maxval); // host reference value for pixel i
+      sycl::float4 dif = output[i] - expected; // signed error; fdim() would clamp undershoot to 0 and hide it
+      float length = sycl::length(dif); // Euclidean norm of the per-channel error (sign-independent)
+      if(length > 0.01f) {
+        pass = false;
+        break;
+      }
     }
     return pass;
-}
-
-
-static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Sobel5";
   }
 
-}; // SobelBench class
 
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Sobel5"; }
 
+}; // Sobel5Bench class
 
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
   app.run<Sobel5Bench>();
   return 0;
 }
-
-
-
-
-
diff --git a/single-kernel/sobel7.cpp b/single-kernel/sobel7.cpp
index b75d25da..5a2d7cf0 100644
--- a/single-kernel/sobel7.cpp
+++ b/single-kernel/sobel7.cpp
@@ -1,32 +1,32 @@
-#include <CL/sycl.hpp>
 #include <iostream>
+#include <sycl/sycl.hpp>
 
-#include "common.h"
 #include "bitmap.h"
+#include "common.h"
 
 
-namespace s = cl::sycl;
+namespace s = sycl;
 class Sobel7BenchKernel; // kernel forward declaration
 
 
 /*
   A Sobel filter with a convolution matrix 7x7.
-  Input and output are two-dimensional buffers of floats.     
+  Input and output are two-dimensional buffers of floats.
  */
-class Sobel7Bench
-{
+class Sobel7Bench {
 protected:
-    std::vector<cl::sycl::float4> input;
-    std::vector<cl::sycl::float4> output;
+  std::vector<sycl::float4> input;
+  std::vector<sycl::float4> output;
+
+  size_t w, h; // size of the input picture
+  size_t size; // user-defined size (input and output will be size x size)
+  BenchmarkArgs args;
 
-    size_t w, h; // size of the input picture
-    size_t size; // user-defined size (input and output will be size x size)
-    BenchmarkArgs args;
+  PrefetchedBuffer<sycl::float4, 2> input_buf;
+  PrefetchedBuffer<sycl::float4, 2> output_buf;
 
-    PrefetchedBuffer<cl::sycl::float4, 2> input_buf;
-    PrefetchedBuffer<cl::sycl::float4, 2> output_buf;
 public:
-  Sobel7Bench(const BenchmarkArgs &_args) : args(_args) {}
+  Sobel7Bench(const BenchmarkArgs& _args) : args(_args) {}
 
   void setup() {
     size = args.problem_size; // input size defined by the user
@@ -38,22 +38,22 @@ class Sobel7Bench
     output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in = input_buf.get_access<s::access::mode::read>(cgh);
       auto out = output_buf.get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<2> ndrange{size, size};
+      sycl::range<2> ndrange{size, size};
 
       // Sobel kernel 7x7
       const float kernel[] = {130, 120, 78, 0, -78, -120, -130, 180, 195, 156, 0, -156, -195, -180, 234, 312, 390, 0,
           -390, -312, -234, 260, 390, 780, 0, -780, -390, -260, 234, 312, 390, 0, -390, -312, -234, 180, 195, 156, 0,
           -156, -195, -180, 130, 120, 78, 0, -78, -120, -130};
 
-      cgh.parallel_for<Sobel7BenchKernel>(ndrange, [in, out, kernel, size_ = size](cl::sycl::id<2> gid) {
+      cgh.parallel_for<Sobel7BenchKernel>(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) {
         int x = gid[0];
         int y = gid[1];
-        cl::sycl::float4 Gx = cl::sycl::float4(0, 0, 0, 0);
-        cl::sycl::float4 Gy = cl::sycl::float4(0, 0, 0, 0);
+        sycl::float4 Gx = sycl::float4(0, 0, 0, 0);
+        sycl::float4 Gy = sycl::float4(0, 0, 0, 0);
         const int radius = 7;
 
         // constant-size loops in [0,1,2,3,4,5,6]
@@ -70,25 +70,25 @@ class Sobel7Bench
               continue;
 
             // sample color
-            cl::sycl::float4 sample = in[{xs, ys}];
+            sycl::float4 sample = in[{xs, ys}];
 
             // convolution calculation
             int offset_x = x_shift + y_shift * radius;
             int offset_y = y_shift + x_shift * radius;
 
             float conv_x = kernel[offset_x];
-            cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
+            sycl::float4 conv4_x = sycl::float4(conv_x);
             Gx += conv4_x * sample;
 
             float conv_y = kernel[offset_y];
-            cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
+            sycl::float4 conv4_y = sycl::float4(conv_y);
             Gy += conv4_y * sample;
           }
         }
         // taking root of sums of squares of Gx and Gy
-        cl::sycl::float4 color = hypot(Gx, Gy);
-        cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-        cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
+        sycl::float4 color = hypot(Gx, Gy);
+        sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+        sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
         out[gid] = clamp(color, minval, maxval);
       });
     }));
@@ -99,22 +99,16 @@ class Sobel7Bench
     output_buf.reset();
     save_bitmap("sobel7.bmp", size, output);
 
-    const float kernel[] = {
-      130, 120, 78,  0, -78,  -120, -130,
-      180, 195, 156, 0, -156, -195, -180,
-      234, 312, 390, 0, -390, -312, -234,
-      260, 390, 780, 0, -780, -390, -260,
-      234, 312, 390, 0, -390, -312, -234,
-      180, 195, 156, 0, -156, -195, -180,
-      130, 120, 78,  0, -78,  -120, -130
-    };
+    const float kernel[] = {130, 120, 78, 0, -78, -120, -130, 180, 195, 156, 0, -156, -195, -180, 234, 312, 390, 0,
+        -390, -312, -234, 260, 390, 780, 0, -780, -390, -260, 234, 312, 390, 0, -390, -312, -234, 180, 195, 156, 0,
+        -156, -195, -180, 130, 120, 78, 0, -78, -120, -130};
 
     bool pass = true;
     int radius = 7;
     for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) {
       int x = i % size;
       int y = i / size;
-      cl::sycl::float4 Gx, Gy;
+      sycl::float4 Gx(0, 0, 0, 0), Gy(0, 0, 0, 0); // zero the accumulators explicitly, as the device kernel does
       for(uint x_shift = 0; x_shift < 7; x_shift++)
         for(uint y_shift = 0; y_shift < 7; y_shift++) {
           uint xs = x + x_shift - 3;
@@ -123,22 +117,22 @@ class Sobel7Bench
             continue;
           if(xs < 0 || xs >= size || ys < 0 || ys >= size)
             continue;
-          cl::sycl::float4 sample = input[xs + ys * size];
+          sycl::float4 sample = input[xs + ys * size];
           int offset_x = x_shift + y_shift * radius;
           int offset_y = y_shift + x_shift * radius;
           float conv_x = kernel[offset_x];
-          cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x);
+          sycl::float4 conv4_x = sycl::float4(conv_x);
           Gx += conv4_x * sample;
           float conv_y = kernel[offset_y];
-          cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y);
+          sycl::float4 conv4_y = sycl::float4(conv_y);
           Gy += conv4_y * sample;
         }
-      cl::sycl::float4 color = hypot(Gx, Gy);
-      cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0);
-      cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0);
-      cl::sycl::float4 expected = clamp(color, minval, maxval);
-      cl::sycl::float4 dif = fdim(output[i], expected);
-      float length = cl::sycl::length(dif);
+      sycl::float4 color = hypot(Gx, Gy); // per-channel sqrt(Gx^2 + Gy^2), same as the device kernel
+      sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0);
+      sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0);
+      sycl::float4 expected = clamp(color, minval, maxval); // host reference value for pixel i
+      sycl::float4 dif = output[i] - expected; // signed error; fdim() would clamp undershoot to 0 and hide it
+      float length = sycl::length(dif); // Euclidean norm of the per-channel error (sign-independent)
       if(length > 0.01f) {
         pass = false;
         break;
@@ -148,24 +142,13 @@ class Sobel7Bench
   }
 
 
-static std::string getBenchmarkName(BenchmarkArgs& args) {
-    return "Sobel7";
-  }
+  static std::string getBenchmarkName(BenchmarkArgs& args) { return "Sobel7"; }
 
 }; // SobelBench class
 
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
   app.run<Sobel7Bench>();
   return 0;
 }
-
-
-
-
-
-
diff --git a/single-kernel/vec_add.cpp b/single-kernel/vec_add.cpp
index 7d768814..0a1a7231 100644
--- a/single-kernel/vec_add.cpp
+++ b/single-kernel/vec_add.cpp
@@ -2,17 +2,17 @@
 
 #include <iostream>
 
-// Opening cl::sycl namespace is unsupported on hipSYCL 
-// (mainly due to CUDA/HIP design issues), better 
+// Opening sycl namespace is unsupported on hipSYCL
+// (mainly due to CUDA/HIP design issues), better
 // avoid it
-//using namespace cl::sycl;
-namespace s = cl::sycl;
-template <typename T> class VecAddKernel;
+// using namespace sycl;
+namespace s = sycl;
+template <typename T>
+class VecAddKernel;
 
 template <typename T>
-class VecAddBench
-{
-protected:    
+class VecAddBench {
+protected:
   std::vector<T> input1;
   std::vector<T> input2;
   std::vector<T> output;
@@ -23,15 +23,15 @@ class VecAddBench
   PrefetchedBuffer<T, 1> output_buf;
 
 public:
-  VecAddBench(const BenchmarkArgs &_args) : args(_args) {}
-  
+  VecAddBench(const BenchmarkArgs& _args) : args(_args) {}
+
   void setup() {
     // host memory intilization
     input1.resize(args.problem_size);
     input2.resize(args.problem_size);
     output.resize(args.problem_size);
 
-    for (size_t i =0; i < args.problem_size; i++) {
+    for(size_t i = 0; i < args.problem_size; i++) {
       input1[i] = static_cast<T>(i);
       input2[i] = static_cast<T>(i);
       output[i] = static_cast<T>(0);
@@ -42,39 +42,33 @@ class VecAddBench
     output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size));
   }
 
-  void run(std::vector<cl::sycl::event>& events) {
-    events.push_back(args.device_queue.submit(
-        [&](cl::sycl::handler& cgh) {
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
       auto in1 = input1_buf.template get_access<s::access::mode::read>(cgh);
       auto in2 = input2_buf.template get_access<s::access::mode::read>(cgh);
       // Use discard_write here, otherwise the content of the host buffer must first be copied to device
       auto out = output_buf.template get_access<s::access::mode::discard_write>(cgh);
-      cl::sycl::range<1> ndrange {args.problem_size};
+      sycl::range<1> ndrange{args.problem_size};
 
-      cgh.parallel_for<class VecAddKernel<T>>(ndrange,
-        [=](cl::sycl::id<1> gid) 
-        {
-          out[gid] = in1[gid] + in2[gid];
-        });
+      cgh.parallel_for<class VecAddKernel<T>>(ndrange, [=](sycl::id<1> gid) { out[gid] = in1[gid] + in2[gid]; });
     }));
-
   }
 
-  bool verify(VerificationSetting &ver) {
-    //Triggers writeback
+  bool verify(VerificationSetting& ver) {
+    // Triggers writeback
     output_buf.reset();
 
     bool pass = true;
-    for(size_t i=ver.begin[0]; i<ver.begin[0]+ver.range[0]; i++){
-        auto expected = input1[i] + input2[i];
-        if(expected != output[i]){
-            pass = false;
-            break;
-        }
-      }    
+    for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) {
+      auto expected = input1[i] + input2[i];
+      if(expected != output[i]) {
+        pass = false;
+        break;
+      }
+    }
     return pass;
   }
-  
+
   static std::string getBenchmarkName(BenchmarkArgs& args) {
     std::stringstream name;
     name << "VectorAddition_";
@@ -83,13 +77,13 @@ class VecAddBench
   }
 };
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
   BenchmarkApp app(argc, argv);
   app.run<VecAddBench<int>>();
-  app.run<VecAddBench<long long>>();  
+  app.run<VecAddBench<long long>>();
   app.run<VecAddBench<float>>();
-  if(app.deviceSupportsFP64())
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) {
     app.run<VecAddBench<double>>();
+  }
   return 0;
 }
diff --git a/sycl2020/USM/usm_accessors_latency.cpp b/sycl2020/USM/usm_accessors_latency.cpp
new file mode 100644
index 00000000..49d2c922
--- /dev/null
+++ b/sycl2020/USM/usm_accessors_latency.cpp
@@ -0,0 +1,171 @@
+#include "common.h"
+#include "memory_wrappers.h"
+
+namespace s = sycl;
+
+static constexpr std::size_t kernels_launch_default = 5000;
+
+template <typename DATA_TYPE, bool in_order = false, bool synch = false>
+class accessor_latency_kernel;
+template <typename DATA_TYPE, bool in_order = false, bool synch = false>
+class usm_latency_kernel;
+
+
+/**
+Measure Accessors latency compared to USM
+The benchmark submits multiple small kernels which stress SYCL dependency tracking.
+ */
+template <typename DATA_TYPE, bool in_order>
+class LatencyBenchmark {
+protected:
+  BenchmarkArgs args;
+  size_t kernel_launches_num;
+
+  LatencyBenchmark(const BenchmarkArgs& args, const size_t kernel_launches_num)
+      : args(args), kernel_launches_num(kernel_launches_num) {}
+
+  s::range<1> getRange() const { return s::range<1>{args.problem_size}; }
+
+  s::nd_range<1> getNDRange() const {
+    return s::nd_range<1>{args.problem_size, args.problem_size > 1024 ? 1024 : args.problem_size};
+  }
+
+  sycl::queue& getQueue() {
+    if constexpr(in_order) {
+      return args.device_queue_in_order;
+    } else {
+      return args.device_queue;
+    }
+  }
+};
+
+template <typename DATA_TYPE, bool in_order = false, bool synch = false>
+class AccessorLatency : LatencyBenchmark<DATA_TYPE, in_order> {
+protected:
+  PrefetchedBuffer<DATA_TYPE> buff_A;
+  PrefetchedBuffer<DATA_TYPE> buff_B;
+  PrefetchedBuffer<DATA_TYPE> buff_C;
+
+public:
+  using base = LatencyBenchmark<DATA_TYPE, in_order>;
+  using base::args;
+  using base::base;
+  using base::getNDRange;
+  using base::getQueue;
+  using base::getRange;
+  using base::kernel_launches_num;
+
+  AccessorLatency(const BenchmarkArgs& args, const size_t kernel_launches_num) : base(args, kernel_launches_num) {}
+
+  void setup() {
+    const auto range = getRange();
+    buff_A.initialize(args.device_queue, range);
+    buff_B.initialize(args.device_queue, range);
+    buff_C.initialize(args.device_queue, range);
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    auto& queue = getQueue();
+    // Submit kernel_launches_num small kernels on the same buffers; size_t index matches the counter's type.
+    for(size_t i = 0; i < kernel_launches_num; i++) {
+      events.push_back(queue.submit([&](s::handler& cgh) {
+        auto acc_A = buff_A.template get_access<s::access::mode::read>(cgh, buff_A.get_range());
+        auto acc_B = buff_B.template get_access<s::access::mode::read>(cgh, buff_B.get_range());
+        auto acc_C = buff_C.template get_access<s::access::mode::write>(cgh, buff_C.get_range());
+
+        cgh.parallel_for<class accessor_latency_kernel<DATA_TYPE, in_order, synch>>(
+            getNDRange(), [=](s::nd_item<1> item) {
+              const auto id = item.get_global_linear_id();
+              acc_C[id] = acc_A[id] + acc_B[id];
+            });
+      }));
+      if constexpr(synch) {
+        queue.wait();
+      }
+    }
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "SYCL2020_Accessors_Latency_";
+    name << ReadableTypename<DATA_TYPE>::name << "_";
+    name << (in_order ? "in_order" : "out_of_order") << "_";
+    name << (synch ? "synch" : "") << "_";
+    return name.str();
+  }
+};
+
+template <typename DATA_TYPE, bool in_order = false, bool synch = false>
+class USMLatency : LatencyBenchmark<DATA_TYPE, in_order> {
+protected:
+  USMBuffer<DATA_TYPE> buff_A;
+  USMBuffer<DATA_TYPE> buff_B;
+  USMBuffer<DATA_TYPE> buff_C;
+
+  using base = LatencyBenchmark<DATA_TYPE, in_order>;
+  using base::args;
+  using base::base;
+  using base::getNDRange;
+  using base::getQueue;
+  using base::getRange;
+  using base::kernel_launches_num;
+
+public:
+  USMLatency(const BenchmarkArgs& args, const size_t kernel_launches_num) : base(args, kernel_launches_num) {}
+
+  void setup() {
+    buff_A.initialize(getQueue(), getRange());
+    buff_B.initialize(getQueue(), getRange());
+    buff_C.initialize(getQueue(), getRange());
+  }
+
+  // Submits kernel_launches_num small USM kernels and records one event per launch.
+  void run(std::vector<sycl::event>& events) {
+    auto& queue = getQueue();
+    sycl::event event;
+    auto* acc_A = buff_A.get();
+    auto* acc_B = buff_B.get();
+    auto* acc_C = buff_C.get();
+    for(size_t i = 0; i < kernel_launches_num; i++) {
+      event = queue.submit([&](s::handler& cgh) {
+        // Chain explicitly on the previous launch, except when the queue is in-order or each launch is waited on
+        if constexpr(!in_order && !synch) {
+          cgh.depends_on(event);
+        }
+        cgh.parallel_for<class usm_latency_kernel<DATA_TYPE, in_order, synch>>(getNDRange(), [=](s::nd_item<1> item) {
+          const auto id = item.get_global_linear_id();
+          acc_C[id] = acc_A[id] + acc_B[id];
+        });
+      });
+      if constexpr(synch) {
+        queue.wait();
+      }
+      events.push_back(event);
+    }
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "USM_Latency_";
+    name << ReadableTypename<DATA_TYPE>::name << "_";
+    name << (in_order ? "in_order" : "out_of_order") << "_";
+    name << (synch ? "synch" : "") << "_";
+    return name.str();
+  }
+};
+
+template <template <typename DATA_TYPE, bool in_order = false, bool synch = false> typename latency_kernel>
+void launchBenchmarks(BenchmarkApp& app, const size_t kernel_launches_num) {
+  app.run<latency_kernel<float>>(kernel_launches_num);       // out-of-order, no synch
+  app.run<latency_kernel<float, true>>(kernel_launches_num); // in-order, no synch
+}
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  const size_t kernel_launches_num = app.getArgs().cli.getOrDefault("--num-launches", kernels_launch_default);
+
+  launchBenchmarks<AccessorLatency>(app, kernel_launches_num);
+  launchBenchmarks<USMLatency>(app, kernel_launches_num);
+}
diff --git a/sycl2020/USM/usm_allocation_latency.cpp b/sycl2020/USM/usm_allocation_latency.cpp
new file mode 100644
index 00000000..582dc6ff
--- /dev/null
+++ b/sycl2020/USM/usm_allocation_latency.cpp
@@ -0,0 +1,71 @@
+#include "common.h"
+#include "polybenchUtilFuncts.h"
+#include "usm_utils.hpp"
+
+/**
+Measure USM allocation time for different types of USM memory.
+*/
+template <typename DATA_TYPE, sycl::usm::alloc usm_type>
+class USMAllocationLatency {
+protected:
+  BenchmarkArgs args;
+  DATA_TYPE* buffer;
+
+public:
+  USMAllocationLatency(const BenchmarkArgs& _args) : args(_args), buffer(nullptr) {}
+
+  ~USMAllocationLatency() {
+    if(buffer != nullptr) {
+      sycl::free(buffer, args.device_queue);
+    }
+  }
+
+  void setup() {}
+
+  void run(std::vector<sycl::event>& events) {
+    sycl::queue& queue = args.device_queue;
+    buffer = static_cast<DATA_TYPE*>(sycl::malloc(args.problem_size * sizeof(DATA_TYPE), queue, usm_type));
+  }
+
+  // Checks that the benchmarked allocation is usable: fills it with 1 on the device and reads every element back.
+  // Device allocations are first copied into a temporary host USM buffer, which is released before returning.
+  bool verify(VerificationSetting& settings) {
+    sycl::queue& queue = args.device_queue;
+    queue.fill(buffer, DATA_TYPE{1}, args.problem_size).wait();
+    DATA_TYPE* host_ptr = buffer;
+    if constexpr(usm_type == sycl::usm::alloc::device) {
+      host_ptr = static_cast<DATA_TYPE*>(
+          sycl::malloc(args.problem_size * sizeof(DATA_TYPE), args.device_queue, sycl::usm::alloc::host));
+      queue.copy(buffer, host_ptr, args.problem_size).wait();
+    }
+    bool pass = true;
+    // size_t index avoids a signed/unsigned comparison with problem_size; stop at the first mismatch.
+    for(size_t i = 0; pass && i < args.problem_size; i++) {
+      pass = host_ptr[i] == DATA_TYPE{1};
+    }
+    if constexpr(usm_type == sycl::usm::alloc::device) {
+      sycl::free(host_ptr, queue);
+    }
+    return pass;
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "USM_Allocation_latency_";
+    name << ReadableTypename<DATA_TYPE>::name << "_";
+    name << usm_to_string(usm_type);
+    return name.str();
+  }
+};
+
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+
+  app.run<USMAllocationLatency<float, sycl::usm::alloc::device>>();
+  app.run<USMAllocationLatency<float, sycl::usm::alloc::host>>();
+  app.run<USMAllocationLatency<float, sycl::usm::alloc::shared>>();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/sycl2020/USM/usm_instr_mix.cpp b/sycl2020/USM/usm_instr_mix.cpp
new file mode 100644
index 00000000..91c1fc70
--- /dev/null
+++ b/sycl2020/USM/usm_instr_mix.cpp
@@ -0,0 +1,124 @@
+#include "common.h"
+#include "polybenchUtilFuncts.h"
+#include "usm_utils.hpp"
+
+static constexpr std::size_t d_kernel_launch = 100;
+static constexpr std::size_t offset = 4;
+static constexpr float d_instr_mix = 1;
+
+/**
+This benchmark measures the performance of USM allocations with different host-device instruction mixes.
+It copies some data on the device, then performs some operations on the device, copies the data back to the host and
+performs some operations on the host. This is done in a loop.
+
+It takes a float parameter --instr-mix that specifies the percentage of operations that are performed on the device.
+
+A parameter --num-launches specifies the number of times the operation loop is executed
+
+The benchmark is run with 4 different configurations:
+  - USM allocations with device memory
+  - USM allocations with host memory
+  - USM allocations with shared memory
+  - USM allocations with shared memory and prefetching
+
+The benchmark uses 2 additional configurations:
+  - USM allocations with initialization
+  - USM allocations without initialization
+This helps to measure the overhead of the initialization operation
+*/
+template <typename DATA_TYPE, sycl::usm::alloc usm_type, bool include_init, bool use_prefetch>
+class USMInstructionMix {
+protected:
+  BenchmarkArgs args;
+  size_t kernel_launches;
+  USMBuffer<DATA_TYPE, 1, usm_type> buff1;
+  float instr_mix;
+
+public:
+  USMInstructionMix(const BenchmarkArgs& _args, size_t kernel_launches, float instr_mix)
+      : args(_args), kernel_launches(kernel_launches), instr_mix(instr_mix) {}
+
+
+  void setup() {
+    if constexpr(!include_init) {
+      buff1.initialize(args.device_queue, args.problem_size);
+    }
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    sycl::queue& queue = args.device_queue;
+
+    if constexpr(include_init) {
+      buff1.initialize(args.device_queue, args.problem_size);
+    }
+
+    for(size_t i = 0; i < kernel_launches; i++) {
+      auto device_copy_event = buff1.update_device();
+      // Prefetch if using shared memory, should increase performance
+      if constexpr(usm_type == sycl::usm::alloc::shared && use_prefetch) {
+        device_copy_event = queue.prefetch(buff1.get(), buff1.size() * sizeof(DATA_TYPE), device_copy_event);
+      }
+      auto kernel_event = queue.submit([&](sycl::handler& cgh) {
+        auto* acc_1 = buff1.get();
+        cgh.depends_on(device_copy_event);
+
+        cgh.parallel_for(sycl::nd_range<1>{{args.problem_size}, {args.local_size}},
+            [=, _instr_mix = instr_mix](sycl::nd_item<1> item) {
+              const auto id = item.get_global_id(0);
+              const auto num_ops = item.get_global_range(0) * _instr_mix;
+              for(size_t i = id, j = 0; i < num_ops; i += item.get_global_range(0), j++) {
+                acc_1[(id + j) % item.get_global_range(0)] += DATA_TYPE{1};
+              }
+            });
+      });
+      events.push_back(kernel_event);
+      auto [host_ptr, copy_event] = buff1.update_and_get_host_ptr(kernel_event);
+      copy_event.wait(); // Need this wait 'cause we can't use host tasks and synchronization with the device is needed
+      // Host op
+      for(size_t i = 0; i < buff1.size(); i++) {
+        host_ptr[i] -= DATA_TYPE{1};
+      }
+    }
+  }
+
+  // Passes only if every element is back at DATA_TYPE{0} after the device-add / host-subtract rounds of run().
+  // NOTE(review): assumes the buffer starts zeroed and the default --instr-mix of 1 is used — TODO confirm.
+  bool verify(VerificationSetting& ver) {
+    buff1.update_host();
+    bool pass = true;
+    for(size_t i = 0; pass && i < buff1.size(); i++) {
+      pass = buff1.get_host_ptr()[i] == DATA_TYPE{0};
+    }
+    return pass;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    const float device_op = args.cli.getOrDefault("--instr-mix", d_instr_mix);
+
+    std::stringstream name;
+    name << "USM_Instr_Mix_";
+    name << ReadableTypename<DATA_TYPE>::name << "_";
+    name << usm_to_string(usm_type) << "_";
+    name << "1:" << (std::size_t(device_op) == device_op ? std::size_t(device_op) : device_op)
+         << "mix_"; // avoid .0 if it's an integer
+    name << (include_init ? "with_init_" : "no_init_");
+    name << (use_prefetch ? "with_prefetch" : "no_prefetch");
+    return name.str();
+  }
+};
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  const size_t kernel_launches_num = app.getArgs().cli.getOrDefault("--num-launches", d_kernel_launch);
+  const float instr_mix = app.getArgs().cli.getOrDefault("--instr-mix", d_instr_mix);
+
+  app.run<USMInstructionMix<float, sycl::usm::alloc::device, true, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::host, true, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::shared, true, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::shared, true, true>>(kernel_launches_num, instr_mix);
+
+  app.run<USMInstructionMix<float, sycl::usm::alloc::device, false, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::host, false, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::shared, false, false>>(kernel_launches_num, instr_mix);
+  app.run<USMInstructionMix<float, sycl::usm::alloc::shared, false, true>>(kernel_launches_num, instr_mix);
+}
\ No newline at end of file
diff --git a/sycl2020/USM/usm_pinned_overhead.cpp b/sycl2020/USM/usm_pinned_overhead.cpp
new file mode 100644
index 00000000..11ac8493
--- /dev/null
+++ b/sycl2020/USM/usm_pinned_overhead.cpp
@@ -0,0 +1,101 @@
+#include "common.h"
+
+static constexpr int HOST_DEVICE = 0;
+static constexpr int DEVICE_HOST = 1;
+static constexpr int TEST_VAL = 1;
+static constexpr std::size_t d_num_copies = 1;
+
+/**
+Measure the overhead of copying data from host to device and vice versa using pinned and non-pinned memory.
+Takes a --num-copies parameter to specify how many copies to perform.
+*/
+template <typename DATA_TYPE, bool use_pinned_memory, int direction, bool include_init>
+class USMPinnedOverhead {
+protected:
+  BenchmarkArgs args;
+  DATA_TYPE* buffer;
+  DATA_TYPE* host_memory;
+  size_t num_copies;
+
+private:
+  void init() {
+    sycl::queue& queue = args.device_queue;
+    if constexpr(use_pinned_memory) {
+      host_memory = (DATA_TYPE*)sycl::malloc_host(args.problem_size * sizeof(DATA_TYPE), queue);
+    } else {
+      host_memory = (DATA_TYPE*)malloc(args.problem_size * sizeof(DATA_TYPE));
+    }
+  }
+
+public:
+  USMPinnedOverhead(const BenchmarkArgs& _args, size_t num_copies)
+      : args(_args), buffer(nullptr), host_memory(nullptr), num_copies(num_copies) {}
+
+  ~USMPinnedOverhead() {
+    // Free each allocation independently so that a missing host or device pointer cannot leak the other one.
+    if(buffer != nullptr) {
+      sycl::free(buffer, args.device_queue);
+    }
+    if constexpr(!use_pinned_memory) {
+      free(host_memory); // plain malloc memory; free(nullptr) is a no-op
+    } else if(host_memory != nullptr) {
+      sycl::free(host_memory, args.device_queue);
+    }
+  }
+
+  void setup() {
+    sycl::queue& queue = args.device_queue;
+    if constexpr(!include_init) {
+      init();
+    }
+    buffer = (DATA_TYPE*)sycl::malloc_device(args.problem_size * sizeof(DATA_TYPE), queue);
+  }
+
+  void run(std::vector<sycl::event>& events) {
+    sycl::queue& queue = args.device_queue;
+    if constexpr(include_init) {
+      init();
+    }
+
+    for(size_t i = 0; i < num_copies; i++) {
+      if constexpr(direction == HOST_DEVICE) {
+        events.push_back(queue.copy(host_memory, buffer, args.problem_size));
+      } else {
+        events.push_back(queue.copy(buffer, host_memory, args.problem_size));
+      }
+    }
+  }
+
+  static ThroughputMetric getThroughputMetric(const BenchmarkArgs& args) {
+    const size_t num_copies = args.cli.getOrDefault("--num-copies", d_num_copies);
+    const double copiedGiB = args.problem_size * sizeof(DATA_TYPE) * num_copies / 1024.0 / 1024.0 / 1024.0;
+    return {copiedGiB, "GiB"};
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    const size_t num_copies = args.cli.getOrDefault("--num-copies", d_num_copies);
+
+    name << "USM_Pinned_Overhead_";
+    name << ReadableTypename<DATA_TYPE>::name << "_";
+    name << (direction == HOST_DEVICE ? "HostDevice" : "DeviceHost") << "_";
+    name << (use_pinned_memory ? "Pinned" : "NonPinned") << "_";
+    name << (include_init ? "Init" : "NoInit") << "_";
+    name << num_copies;
+
+    return name.str();
+  }
+};
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  const size_t num_copies = app.getArgs().cli.getOrDefault("--num-copies", d_num_copies);
+
+  app.run<USMPinnedOverhead<float, false, HOST_DEVICE, true>>(num_copies);
+  app.run<USMPinnedOverhead<float, true, HOST_DEVICE, true>>(num_copies);
+  app.run<USMPinnedOverhead<float, false, DEVICE_HOST, true>>(num_copies);
+  app.run<USMPinnedOverhead<float, true, DEVICE_HOST, true>>(num_copies);
+
+
+  return 0;
+}
\ No newline at end of file
diff --git a/sycl2020/USM/usm_utils.hpp b/sycl2020/USM/usm_utils.hpp
new file mode 100644
index 00000000..bc79ded9
--- /dev/null
+++ b/sycl2020/USM/usm_utils.hpp
@@ -0,0 +1,15 @@
+#pragma once
+#include <sycl/sycl.hpp>
+
+// Maps a USM allocation kind to the lowercase label used in benchmark names: "device", "host" or "shared".
+// Any other value yields "unknown".
+// Declared inline because the definition lives in a header: including it from several translation units
+// of one binary must not produce duplicate definitions.
+inline std::string usm_to_string(sycl::usm::alloc usm_type) {
+  switch(usm_type) {
+  case sycl::usm::alloc::device: return "device";
+  case sycl::usm::alloc::host: return "host";
+  case sycl::usm::alloc::shared: return "shared";
+  default: return "unknown";
+  }
+}
\ No newline at end of file
diff --git a/sycl2020/atomics/atomic_reduction.cpp b/sycl2020/atomics/atomic_reduction.cpp
new file mode 100644
index 00000000..2ee5ffed
--- /dev/null
+++ b/sycl2020/atomics/atomic_reduction.cpp
@@ -0,0 +1,76 @@
+#include "common.h"
+#include "polybenchUtilFuncts.h"
+#include <iostream>
+
+namespace s = sycl;
+
+template <typename T>
+class ReductionAtomic {
+  int problem_size = 1;
+  BenchmarkArgs args;
+  PrefetchedBuffer<T, 1> in_buf;
+  PrefetchedBuffer<T, 1> out_buf;
+  std::vector<T> in_vec;
+  T reduction_results;
+
+public:
+  ReductionAtomic(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
+    problem_size = args.problem_size;
+    in_vec.resize(problem_size);
+
+    std::fill(in_vec.begin(), in_vec.end(), 1);
+
+    reduction_results = 0.f;
+    in_buf.initialize(args.device_queue, in_vec.data(), s::range<1>{in_vec.size()});
+    out_buf.initialize(args.device_queue, &reduction_results, s::range<1>{1});
+  }
+  void run(std::vector<s::event>& events) {
+    events.push_back(args.device_queue.submit([&](s::handler& cgh) {
+      auto in_acc = in_buf.template get_access<s::access_mode::read>(cgh);
+      auto out_acc = out_buf.template get_access<s::access_mode::read_write>(cgh);
+      auto ndrange = s::nd_range<1>{problem_size, args.local_size};
+      cgh.parallel_for<ReductionAtomic<T>>(ndrange, [=](sycl::nd_item<1> it) {
+        const auto gid = it.get_global_id();
+        // fetch_add both reads and writes out_acc[0], hence the read_write accessor requested above.
+        s::atomic_ref<T, s::memory_order::relaxed, s::memory_scope::device, s::access::address_space::global_space> atm(
+            out_acc[0]);
+        // Every work-item adds its own input element to the single shared accumulator.
+        atm.fetch_add(in_acc[gid]);
+      });
+    }));
+  }
+  bool verify(VerificationSetting& ver) {
+    auto results = out_buf.get_host_access();
+    constexpr auto ERROR_THRESHOLD = 0.05f;
+
+    T verified_results = 0;
+    for(int i = 0; i < in_vec.size(); i++) verified_results += in_vec[i];
+
+    if(percentDiff(results[0], verified_results) > ERROR_THRESHOLD) {
+      return false;
+    } else
+      return true;
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "ReductionAtomic_";
+    name << ReadableTypename<T>::name;
+
+    return name.str();
+  }
+};
+
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  app.run<ReductionAtomic<int>>();
+  app.run<ReductionAtomic<long long>>();
+  app.run<ReductionAtomic<float>>();
+
+  app.run<ReductionAtomic<double>>();
+  return 0;
+}
diff --git a/sycl2020/group_algorithms/reduce_over_group.cpp b/sycl2020/group_algorithms/reduce_over_group.cpp
new file mode 100644
index 00000000..d2d898e7
--- /dev/null
+++ b/sycl2020/group_algorithms/reduce_over_group.cpp
@@ -0,0 +1,83 @@
+#include "common.h"
+#include "polybenchUtilFuncts.h"
+#include <iostream>
+
+namespace s = sycl;
+
+template <typename T>
+class ReduceGroupAlgorithm {
+  int problem_size = 1;
+  int local_size = 1;
+  BenchmarkArgs args;
+  PrefetchedBuffer<T, 1> in_buf;
+  PrefetchedBuffer<T, 1> out_buf;
+  std::vector<T> in_vec;
+  T reduction_results;
+
+public:
+  ReduceGroupAlgorithm(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
+    problem_size = args.problem_size;
+    local_size = args.local_size;
+    in_vec.resize(problem_size);
+    std::fill(in_vec.begin(), in_vec.end(), 1);
+    reduction_results = 0;
+    in_buf.initialize(args.device_queue, in_vec.data(), s::range<1>{in_vec.size()});
+    out_buf.initialize(args.device_queue, &reduction_results, s::range<1>{1});
+  }
+  void run(std::vector<s::event>& events) {
+    events.push_back(args.device_queue.submit([&](s::handler& cgh) {
+      auto in_acc = in_buf.template get_access<s::access_mode::read>(cgh);
+      auto out_acc = out_buf.template get_access<s::access_mode::read_write>(cgh);
+      // read_write above: the atomic fetch_add below both reads and writes out_acc[0].
+
+      auto ndrange = s::nd_range<1>{problem_size, local_size};
+
+      cgh.parallel_for<ReduceGroupAlgorithm<T>>(ndrange, [=](sycl::nd_item<1> item) {
+        const int lid = item.get_local_id(0);
+        const auto gid = item.get_global_id();
+        // Sum this work-group's elements first, so only one atomic per group reaches global memory.
+        T partial_sum = s::reduce_over_group(item.get_group(), in_acc[gid], s::plus<T>());
+
+        s::atomic_ref<T, s::memory_order::relaxed, s::memory_scope::device, s::access::address_space::global_space> atm(
+            out_acc[0]);
+        if(lid == 0) {
+          atm.fetch_add(partial_sum);
+        }
+      });
+    }));
+  }
+  bool verify(VerificationSetting& ver) {
+    auto results = out_buf.get_host_access();
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    T verified_results = problem_size;
+
+    if(percentDiff(results[0], verified_results) > ERROR_THRESHOLD) {
+      std::cerr << "output: " << results[0] << " correct output: " << verified_results << std::endl;
+      return false;
+    } else
+      return true;
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "ReduceGroupAlgorithm_";
+    name << ReadableTypename<T>::name;
+
+    return name.str();
+  }
+};
+
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  app.run<ReduceGroupAlgorithm<int>>();
+  app.run<ReduceGroupAlgorithm<long long>>();
+  app.run<ReduceGroupAlgorithm<float>>();
+
+  app.run<ReduceGroupAlgorithm<double>>();
+  return 0;
+}
diff --git a/sycl2020/kernel_reduction/kernel_reduction.cpp b/sycl2020/kernel_reduction/kernel_reduction.cpp
new file mode 100644
index 00000000..851ff954
--- /dev/null
+++ b/sycl2020/kernel_reduction/kernel_reduction.cpp
@@ -0,0 +1,89 @@
+#include "common.h"
+#include "polybenchUtilFuncts.h"
+#include <iostream>
+
+namespace s = sycl;
+
+template <typename T, typename Op, int coarse_factor>
+class KernelReductionBench {
+  int problem_size = 1;
+  BenchmarkArgs args;
+  PrefetchedBuffer<T, 1> in_buf;
+  PrefetchedBuffer<T, 1> out_buf;
+  std::vector<T> in_vec;
+  T reduction_results;
+
+public:
+  KernelReductionBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
+    problem_size = args.problem_size;
+
+    in_vec.resize(problem_size);
+    std::fill(in_vec.begin(), in_vec.end(), 1);
+    reduction_results = 0;
+    in_buf.initialize(args.device_queue, in_vec.data(), s::range<1>{in_vec.size()});
+    out_buf.initialize(args.device_queue, &reduction_results, s::range<1>{1});
+  }
+  void run(std::vector<s::event>& events) {
+    events.push_back(args.device_queue.submit([&](s::handler& cgh) {
+// Update reduction variables to SYCL 2020 spec behavior #578: https://github.com/AdaptiveCpp/AdaptiveCpp/pull/578
+#ifdef __ACPP__
+      auto r = s::reduction(out_buf.template get_access<s::access_mode::read_write>(cgh), Op());
+#else
+      auto r = s::reduction(out_buf.get(), cgh, Op());
+#endif
+      auto in_acc = in_buf.template get_access<s::access_mode::read>(cgh);
+      cgh.parallel_for(s::range<1>{problem_size / coarse_factor}, r, [=](s::id<1> idx, auto& op) {
+        for(int i = 0; i < coarse_factor; i++) op.combine(in_acc[idx * coarse_factor + i]);
+      });
+    }));
+  }
+  bool verify(VerificationSetting& ver) {
+    constexpr auto ERROR_THRESHOLD = 0.05;
+
+    auto results = out_buf.get_host_access();
+    T verified_results = problem_size;
+
+    if(percentDiff(results[0], verified_results) > ERROR_THRESHOLD) {
+      std::cout << results[0] << " -- " << verified_results << std::endl;
+      return false;
+    } else
+      return true;
+  }
+
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    std::stringstream name;
+    name << "KernelReduction_";
+    name << ReadableTypename<T>::name;
+    if constexpr(std::is_same<Op, sycl::plus<T>>::value) {
+      name << "_plus";
+    }
+    name << "_cf" << coarse_factor;
+    return name.str();
+  }
+};
+
+
+template <typename T, typename Op>
+void runCoarsening(BenchmarkApp& app) {
+  app.run<KernelReductionBench<T, Op, 1>>();
+  app.run<KernelReductionBench<T, Op, 4>>();
+  app.run<KernelReductionBench<T, Op, 8>>();
+}
+
+template <typename T>
+void runOperators(BenchmarkApp& app) {
+  runCoarsening<T, sycl::plus<T>>(app);
+}
+
+int main(int argc, char** argv) {
+  BenchmarkApp app(argc, argv);
+  runOperators<int>(app);
+  runOperators<long long>(app);
+  runOperators<float>(app);
+
+  runOperators<double>(app);
+  return 0;
+}
diff --git a/sycl2020/spec_constants/spec_constant_convolution.cpp b/sycl2020/spec_constants/spec_constant_convolution.cpp
new file mode 100644
index 00000000..8669122c
--- /dev/null
+++ b/sycl2020/spec_constants/spec_constant_convolution.cpp
@@ -0,0 +1,213 @@
+// Specialization constant benchmark
+// - runs a generic 9 point stencil of which only 5 points are used in practice
+// - weights are provided either
+//   * fully dynamically (AccessVariants::dynamic_value),
+//   * as specialization constants (AccessVariants::spec_const_value), or
+//   * statically at compile time (AccessVariants::constexpr_value)
+// Example run: ./spec_constant_convolution --device=gpu --no-verification --size=8192 --output=out.csv
+
+#include "common.h"
+#include <iostream>
+
+enum class AccessVariants {
+  dynamic_value,    // weights are computed on the host at run time and captured by the kernel
+  spec_const_value, // weights are passed to the kernel as specialization constants
+  constexpr_value,  // weights are literals written directly in the kernel
+};
+
+namespace s = sycl;
+template <typename T, AccessVariants AccessVariant, int InnerLoops>
+class ConvKernel; // declaration only: used as the kernel name in SpecConstConvBench::run's parallel_for
+
+// 3x3 convolution benchmark over a problem_size x problem_size grid of elements of type T.
+// AccessVariant selects how the kernel obtains its weights: runtime values, specialization constants, or literals.
+// InnerLoops repeats the stencil accumulation inside the kernel to tune its arithmetic intensity.
+template <typename T, AccessVariants AccessVariant, int InnerLoops>
+class SpecConstConvBench {
+  int problem_size = 1;
+
+  using coeff_t = std::array<std::array<T, 3>, 3>;
+
+  // generates the 3x3 weights: 2 on the centre row and column, 0 in the four corners
+  coeff_t getCoefficients() {
+    // trick the compiler a bit - problem size is never < 0, but the compiler doesn't know that
+    T val(problem_size < 0 ? problem_size : 2);
+    T val0(problem_size < 0 ? problem_size : 0);
+    return {{{val0, val, val0}, {val, val, val}, {val0, val, val0}}};
+  }
+
+  T getDivider() {
+    // evaluates to 5 for every non-negative problem size; analogous to above
+    return problem_size < 0 ? T(problem_size) : T(5);
+  }
+
+  T getInitValue() {
+    // evaluates to 1 for every non-negative problem size; analogous to above
+    return problem_size < 0 ? T(problem_size) : T(1);
+  }
+
+#ifdef HIPSYCL_EXT_SPECIALIZED
+  // ACPP implements sycl::specialized instead of spec constants
+  sycl::specialized<coeff_t> coeff_spec;
+  sycl::specialized<T> div_spec;
+#else
+  // ids for the specialization constants
+  static constexpr s::specialization_id<coeff_t> coeff_id;
+  static constexpr s::specialization_id<T> div_id;
+#endif
+
+  BenchmarkArgs args;
+
+  PrefetchedBuffer<T, 2> in_buf;
+  PrefetchedBuffer<T, 2> out_buf;
+
+  std::vector<T> in_vec;
+
+public:
+  SpecConstConvBench(const BenchmarkArgs& _args) : args(_args) {}
+
+  void setup() {
+    // fill the host grid with the init value, then initialize in_buf and out_buf from it
+    problem_size = (int)args.problem_size;
+    in_vec.resize(static_cast<size_t>(problem_size) * problem_size); // widened first: int * int can overflow
+    std::fill(in_vec.begin(), in_vec.end(), getInitValue());
+
+    in_buf.initialize(args.device_queue, in_vec.data(), s::range<2>(problem_size, problem_size));
+    out_buf.initialize(args.device_queue, in_vec.data(), s::range<2>(problem_size, problem_size));
+  }
+
+  // submits the convolution kernel to args.device_queue and appends the resulting event to `events`
+  void run(std::vector<sycl::event>& events) {
+    events.push_back(args.device_queue.submit([&](sycl::handler& cgh) {
+      auto in = in_buf.template get_access<s::access::mode::read>(cgh);
+      auto out = out_buf.template get_access<s::access::mode::write>(cgh);
+
+      // provide the weights the way AccessVariant asks for (constexpr_value needs no host-side setup)
+      coeff_t dynamic_coeff;
+      T dynamic_div;
+      if constexpr(AccessVariant == AccessVariants::dynamic_value) {
+        dynamic_coeff = getCoefficients();
+        dynamic_div = getDivider();
+      } else if constexpr(AccessVariant == AccessVariants::spec_const_value) {
+#ifndef HIPSYCL_EXT_SPECIALIZED
+        cgh.set_specialization_constant<coeff_id>(getCoefficients());
+        cgh.set_specialization_constant<div_id>(getDivider());
+#else
+        coeff_spec = getCoefficients();
+        div_spec = getDivider();
+#endif
+      }
+
+      cgh.parallel_for<class ConvKernel<T, AccessVariant, InnerLoops>>(in.get_range(),
+#ifdef HIPSYCL_EXT_SPECIALIZED
+          [=, coeff_spec_copy = coeff_spec, div_spec_copy = div_spec](
+              s::item<2> item_id) // Copy to avoid this ptr access in lambda
+#else
+      [=](s::item<2> item_id, s::kernel_handler h)
+#endif
+          {
+            T acc = 0;
+            coeff_t coeff;
+            T div;
+            if constexpr(AccessVariant == AccessVariants::dynamic_value) {
+              coeff = dynamic_coeff;
+              div = dynamic_div;
+            } else if constexpr(AccessVariant == AccessVariants::spec_const_value) {
+#ifndef HIPSYCL_EXT_SPECIALIZED
+              coeff = h.get_specialization_constant<coeff_id>();
+              div = h.get_specialization_constant<div_id>();
+#else
+              coeff = coeff_spec_copy;
+              div = div_spec_copy;
+#endif
+            } else if constexpr(AccessVariant == AccessVariants::constexpr_value) {
+              coeff = {{{0, 2, 0}, {2, 2, 2}, {0, 2, 0}}};
+              div = 5;
+            }
+            for(int k = 0; k < InnerLoops; ++k) {
+              for(int i = -1; i <= 1; i++) {
+                if(item_id[0] + i < 0 || item_id[0] + i >= in.get_range()[0]) // unsigned: 0 - 1 wraps, caught by >=
+                  continue;
+                for(int j = -1; j <= 1; j++) {
+                  if(item_id[1] + j < 0 || item_id[1] + j >= in.get_range()[1])
+                    continue;
+                  acc += coeff[i + 1][j + 1] * in[item_id[0] + i][item_id[1] + j];
+                }
+              }
+            }
+            out[item_id] = acc / (div * T(InnerLoops));
+          });
+    }));
+  }
+
+  bool verify(VerificationSetting& ver) {
+    // only interior cells are checked: all nine taps are in range there, so each must equal one expected value
+    auto out_acc = out_buf.get_host_access();
+    bool pass = true;
+
+    auto c = getCoefficients();
+    auto d = getDivider();
+    auto v = getInitValue();
+    T expected_val = 0;
+    for(int i = 0; i < InnerLoops; ++i) {
+      expected_val += v * c[0][0] + v * c[0][1] + v * c[0][2]   //
+                      + v * c[1][0] + v * c[1][1] + v * c[1][2] //
+                      + v * c[2][0] + v * c[2][1] + v * c[2][2];
+    }
+    expected_val /= d * InnerLoops;
+
+    for(size_t x = 1; x + 1 < args.problem_size && pass; ++x) { // "+ 1" form cannot wrap for size 0
+      for(size_t y = 1; y + 1 < args.problem_size && pass; ++y) {
+        if(out_acc[x][y] != expected_val) {
+          std::cout << "Fail at = " << x << " / " << y << "\nExpected = " << expected_val << "\nActual = " << out_acc[x][y]
+                    << std::endl;
+          pass = false;
+          break;
+        }
+      }
+    }
+
+    return pass;
+  }
+
+  static std::string getBenchmarkName(BenchmarkArgs& args) {
+    // name layout: SpecConstantConvolution_<type>_<access variant>_IL<InnerLoops>
+    std::stringstream name;
+    name << "SpecConstantConvolution_" << ReadableTypename<T>::name;
+    if constexpr(AccessVariant == AccessVariants::dynamic_value) {
+      name << "_DynamicValue";
+    } else if constexpr(AccessVariant == AccessVariants::spec_const_value) {
+      name << "_SpecConstValue";
+    } else if constexpr(AccessVariant == AccessVariants::constexpr_value) {
+      name << "_ConstExprValue";
+    }
+    name << "_IL" << InnerLoops;
+    return name.str();
+  }
+};
+
+
+// Runs SpecConstConvBench<T, AccessVariant, N> for N = 1, 16 and 64 inner loop iterations, in that order.
+template <class T, AccessVariants AccessVariant> void runLoopCounts(BenchmarkApp& app) {
+  app.run<SpecConstConvBench<T, AccessVariant, 1>>();
+  app.run<SpecConstConvBench<T, AccessVariant, 16>>();
+  app.run<SpecConstConvBench<T, AccessVariant, 64>>();
+}
+
+// Runs the inner-loop sweep for T with each access variant: dynamic, specialization constant, constexpr.
+template <class T> void runAccessVariants(BenchmarkApp& app) {
+  runLoopCounts<T, AccessVariants::dynamic_value>(app);
+  runLoopCounts<T, AccessVariants::spec_const_value>(app);
+  runLoopCounts<T, AccessVariants::constexpr_value>(app);
+}
+
+int main(int argc, char** argv) {
+  // One sweep per element type; double is included only when SYCL_BENCH_HAS_FP64_SUPPORT is true.
+  BenchmarkApp app(argc, argv);
+  runAccessVariants<int>(app);
+  runAccessVariants<long long>(app);
+  runAccessVariants<float>(app);
+  if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT)
+    runAccessVariants<double>(app);
+  return 0;
+}