diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..d21f4880 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,28 @@ +--- +# +# Clang-Tidy configuration for SYCL-Bench. +# +# There are three usage scenarios: +# 1. Automatic checks through an IDE (CLion, VsCode, ...) +# 2. Running manually on select files (not recommended) +# `clang-tidy -p path/to/compile_commands.json file1 [file2, ...]` +# Note: A script for running clang-tidy on all Celerity sources is provided in `ci/run-clang-tidy.sh` +# 3. Running on a diff (for CI) +# `git diff -U0 --no-color | clang-tidy-diff.py -p1 -path path/to/compile_commands.json` +# +InheritParentConfig: false +# See https://clang.llvm.org/extra/clang-tidy/checks/list.html for a full list of available checks. +Checks: -*, + readability-*, + -readability-avoid-const-params-in-decls, + -readability-function-cognitive-complexity, + -readability-identifier-length, + -readability-magic-numbers, + -readability-uppercase-literal-suffix, + -readability-convert-member-functions-to-static + -readability-qualified-auto + +# Treat naming violations as errors +WarningsAsErrors: "readability-identifier-naming" +# Use .clang-format configuration for fixes +FormatStyle: file diff --git a/.gitignore b/.gitignore index 10f3c3d8..b9b4ae53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,10 @@ /build* +*.csv +img/ +# Clangd +.cache/ +.clangd + +# Vscode +.vscode/ \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..eff18c29 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,21 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +conference-paper: "Proceedings of the 12th International Workshop on OpenCL and SYCL (IWOCL 24)" +authors: +- family-names: "Luigi" + given-names: "Crisci" +- family-names: "Lorenzo" + given-names: "Carpentieri" +- family-names: "Peter" + given-names: "Thoman" +- family-names: "Aksel" + given-names: "Alpay" +- family-names: "Vincent" + given-names: "Heuveline" +- family-names: "Biagio" + given-names: "Cosenza" +title: "SYCL-Bench 2020: Benchmarking SYCL 2020 on AMD, Intel, and NVIDIA GPUs" +version: 2.0.4 +doi: 10.1145/3648115.3648120 +date-released: 2024-04-08 +url: "https://github.com/unisa-hpc/sycl-bench/" diff --git a/CMakeLists.txt b/CMakeLists.txt index a22b0fcf..ae0f2426 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,24 +1,31 @@ -cmake_minimum_required (VERSION 3.5) +cmake_minimum_required(VERSION 3.5) project(sycl-bench) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake) set_property(GLOBAL PROPERTY USE_FOLDERS ON) if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake Build Type" FORCE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake Build Type" FORCE) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -# Due to CMake limitations, hipSYCL requires C++ standard to be set manually -set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -std=c++17") +# Default build flags +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG -fno-omit-frame-pointer" CACHE STRING "Flags used by the C++ compiler during debug builds." FORCE) +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native -ffast-math" CACHE STRING "Flags used by the C++ compiler during release builds." FORCE) +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -DNDEBUG -march=native -ffast-math -g -fno-omit-frame-pointer" CACHE STRING "Flags used by the C++ compiler during release builds with debug info." FORCE) + if(CMAKE_GENERATOR STREQUAL "Ninja") - set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -fdiagnostics-color=always") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") - set(COMPUTECPP_USER_FLAGS "${COMPUTECPP_USER_FLAGS} -fdiagnostics-color=always") + set(CMAKE_SYCL_FLAGS "${CMAKE_SYCL_FLAGS} -fdiagnostics-color=always") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") + set(COMPUTECPP_USER_FLAGS "${COMPUTECPP_USER_FLAGS} -fdiagnostics-color=always") +endif() + +if(SYCL_BENCH_ENABLE_QUEUE_PROFILING) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSYCL_BENCH_ENABLE_QUEUE_PROFILING") endif() include(InstallRequiredSystemLibraries) @@ -28,31 +35,55 @@ include_directories(${CMAKE_SOURCE_DIR}/include) include_directories(${CMAKE_SOURCE_DIR}/polybench/common) set(supported_implementations - ComputeCpp - hipSYCL - LLVM - LLVM-CUDA - triSYCL + AdaptiveCpp + dpcpp + triSYCL ) list(FIND supported_implementations ${SYCL_IMPL} impl_idx) + if(NOT SYCL_IMPL OR impl_idx EQUAL -1) - message(FATAL_ERROR "Please specify SYCL_IMPL (one of: ${supported_implementations})") + message(FATAL_ERROR "Please specify SYCL_IMPL (one of: ${supported_implementations})") endif() -if(SYCL_IMPL STREQUAL "ComputeCpp") - find_package(ComputeCpp MODULE REQUIRED) -elseif(SYCL_IMPL STREQUAL "hipSYCL") - find_package(hipSYCL CONFIG REQUIRED) -elseif(SYCL_IMPL STREQUAL "LLVM") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") -elseif(SYCL_IMPL STREQUAL "LLVM-CUDA") - set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice") +if(SYCL_IMPL STREQUAL "AdaptiveCpp") + find_package(AdaptiveCpp REQUIRED) +elseif(SYCL_IMPL STREQUAL "dpcpp") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") + + if(DPCPP_WITH_CUDA_BACKEND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + + set(CUDA_ARCH "" CACHE STRING "CUDA device architecture e.g. sm_70") + + if(NOT CUDA_ARCH STREQUAL "") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --cuda-gpu-arch=${CUDA_ARCH}") + endif() + endif() + + if(DPCPP_WITH_ROCM_BACKEND) + set(ROCM_ARCH "" CACHE STRING "ROCm device architecture e.g. gfx908") + + if(NOT ROCM_ARCH STREQUAL "") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amd_gpu_${ROCM_ARCH}") + endif() + endif() + + if(DPCPP_WITH_LZ_BACKEND) + set(LZ_ARCH "" CACHE STRING "Level Zero device architecture e.g. acm-g10") + + if(NOT LZ_ARCH STREQUAL "") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=${LZ_ARCH}") + endif() + endif() + elseif(SYCL_IMPL STREQUAL "triSYCL") - find_package(TriSYCL MODULE REQUIRED) + find_package(TriSYCL MODULE REQUIRED) endif() +# Check if SYCL implementation implements the required SYCL features +include(HasFeatures) + set(benchmarks micro/arith.cpp micro/DRAM.cpp @@ -92,42 +123,56 @@ set(benchmarks polybench/mvt.cpp polybench/syr2k.cpp polybench/syrk.cpp - #compiletime/compiletime.cpp -) -foreach(benchmark IN LISTS benchmarks) - get_filename_component(target ${benchmark} NAME_WE) + # compiletime/compiletime.cpp + sycl2020/atomics/atomic_reduction.cpp + sycl2020/USM/usm_accessors_latency.cpp + sycl2020/USM/usm_instr_mix.cpp + sycl2020/USM/usm_pinned_overhead.cpp + sycl2020/USM/usm_allocation_latency.cpp +) +# Selectively add benchmarks based on some SYCL 2020 features +if (SYCL_BENCH_HAS_SPEC_CONSTANTS) + list(APPEND benchmarks sycl2020/spec_constants/spec_constant_convolution.cpp) +endif() +if (SYCL_BENCH_HAS_KERNEL_REDUCTIONS) + list(APPEND benchmarks sycl2020/kernel_reduction/kernel_reduction.cpp) +endif() +if (SYCL_BENCH_HAS_GROUP_ALGORITHMS) + list(APPEND benchmarks sycl2020/group_algorithms/reduce_over_group.cpp) +endif() - add_executable(${target} ${benchmark}) +# Setting variables +add_compile_definitions(SYCL_BENCH_HAS_FP64_SUPPORT=$) - if(SYCL_IMPL STREQUAL "ComputeCpp" OR SYCL_IMPL STREQUAL "hipSYCL") - add_sycl_to_target(TARGET ${target} SOURCES ${benchmark}) - endif() +foreach(benchmark IN LISTS benchmarks) + get_filename_component(target ${benchmark} NAME_WE) - if(SYCL_IMPL STREQUAL "ComputeCpp" AND COMPUTECPP_BITCODE STREQUAL "ptx64") - target_compile_definitions(${target} PRIVATE SYCL_BENCH_ENABLE_QUEUE_PROFILING) - endif() + add_executable(${target} ${benchmark}) - if(SYCL_IMPL STREQUAL "LLVM") - target_compile_definitions(${target} PRIVATE __LLVM_SYCL__) - endif() + if(SYCL_IMPL STREQUAL "AdaptiveCpp") + add_sycl_to_target(TARGET ${target} SOURCES ${benchmark}) + endif() - if(SYCL_IMPL STREQUAL "LLVM-CUDA") - target_compile_definitions(${target} PRIVATE __LLVM_SYCL_CUDA__) - endif() + if(SYCL_IMPL STREQUAL "dpcpp") + target_compile_definitions(${target} PRIVATE __DPCPP__) + endif() if(SYCL_IMPL STREQUAL "triSYCL") add_sycl_to_target(${target}) target_compile_definitions(${target} PRIVATE __TRISYCL__) endif() - + + if(ENABLE_TIME_EVENT_PROFILING) + target_compile_definitions(${target} PUBLIC SYCL_BENCH_ENABLE_QUEUE_PROFILING=1) + endif() + install(TARGETS ${target} RUNTIME DESTINATION bin/benchmarks/) get_filename_component(dir ${benchmark} DIRECTORY) set_property(TARGET ${target} PROPERTY FOLDER ${dir}) endforeach(benchmark) # The "compiletime" target should only be used in the context of the compile time evaluation script -#set_target_properties(compiletime PROPERTIES EXCLUDE_FROM_ALL 1) - +# set_target_properties(compiletime PROPERTIES EXCLUDE_FROM_ALL 1) install(PROGRAMS bin/run-suite DESTINATION bin/) -install(FILES ${PROJECT_SOURCE_DIR}/Brommy.bmp DESTINATION share/) +install(FILES ${PROJECT_SOURCE_DIR}/share/Brommy.bmp DESTINATION share/) \ No newline at end of file diff --git a/README.md b/README.md index 9aa6b758..be957022 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ $ mkdir build && cd build Compile with CMake ``` -$ cmake -DSYCL_IMPL=[target SYCL implementation] [other compiler arguments] .. +$ cmake -DSYCL_IMPL=[target SYCL implementation] [-DSYCL_BENCH_HAS_FP64_SUPPORT=ON|OFF] [other compiler arguments] .. $ cmake --build . $ sudo make install ``` @@ -57,6 +57,24 @@ Packages built via the `package` target will contain all files contained in a SY ## Attribution If you use SYCL-Bench, please cite the following papers: +``` +@inproceedings{SYCL-Bench:IWOCL:2024, +author = {Crisci, Luigi and Carpentieri, Lorenzo and Thoman, Peter and Alpay, Aksel and Heuveline, Vincent and Cosenza, Biagio}, +title = {SYCL-Bench 2020: Benchmarking SYCL 2020 on AMD, Intel, and NVIDIA GPUs}, +year = {2024}, +isbn = {9798400717901}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3648115.3648120}, +doi = {10.1145/3648115.3648120}, +booktitle = {Proceedings of the 12th International Workshop on OpenCL and SYCL}, +articleno = {1}, +numpages = {12}, +keywords = {GPU, HPC, SYCL, benchmark, heterogeneous computing, portability}, +location = {, Chicago, IL, USA, }, +series = {IWOCL '24} +} +``` ``` @inproceedings{SYCL-Bench:Euro-Par:2020, diff --git a/bin/run-suite b/bin/run-suite index ea8c44ab..b8a75715 100755 --- a/bin/run-suite +++ b/bin/run-suite @@ -108,9 +108,36 @@ default_profile = { }, 'mvt' : { '--size' : create_log_range(2**14, 2**14) - }, - }, - 'individual-benchmark-flags' : set([]) + }, + 'usm_accessors_latency' : { + '--size' : create_log_range(2**20, 2**20) + }, + 'usm_allocation_latency' : { + '--size' : create_log_range(2**25, 2**25) + }, + 'usm_instr_mix' : { + '--size' : create_log_range(2**14, 2**14) + }, + 'usm_pinned_overhead' : { + '--size' : create_log_range(2**20, 2**20) + }, + 'spec_constant_convolution' : { + '--size' : create_log_range(2**11, 2**11) + }, + 'atomic_reduction' : { + '--size' : create_log_range(2**20, 2**20) + }, + 'reduce_over_group' : { + '--size' : create_log_range(2**20, 2**20) + }, + 'kernel_reduction' : { + '--size' : create_log_range(2**20, 2**20) + } + }, + 'individual-benchmark-flags' : { + 'usm_instr_mix' : ['--instr-mix=6'], + 'usm_pinned_overhead' : ['--num-copies=5'], + } } def construct_profile(overridden_options_dict, @@ -222,8 +249,9 @@ if __name__ == '__main__': if benchmark_name in individual_benchmark_options: for param in individual_benchmark_options[benchmark_name]: options[param] = individual_benchmark_options[benchmark_name][param] + if benchmark_name in individual_benchmark_flags: - for f in individual_benchmark_flags: + for f in individual_benchmark_flags[benchmark_name]: flags.add(f) max_runtime = 0.0 diff --git a/cmake/HasFeatures.cmake b/cmake/HasFeatures.cmake new file mode 100644 index 00000000..8ca8bd6c --- /dev/null +++ b/cmake/HasFeatures.cmake @@ -0,0 +1,22 @@ +macro(check_feature VAR FILENAME) + if(NOT DEFINED RUN_RES_${VAR}) + try_run(RUN_RES_${VAR} COMPILE_RES_${VAR} ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/has-features/src/${FILENAME} + CMAKE_FLAGS ${CMAKE_CXX_FLAGS} + COMPILE_OUTPUT_VARIABLE OUTPUT_VAR + RUN_OUTPUT_VARIABLE RUN_VAR + ) + endif() + + if (COMPILE_RES_${VAR} AND RUN_RES_${VAR} EQUAL 0) + set(RES ON) + else() + set(RES OFF) + endif() + message(STATUS "${VAR}: ${RES}") +endmacro() + +message(STATUS "Checking for SYCL features....") +check_feature(KERNEL_REDUCTIONS kernel_reduction_dummy.cpp) +check_feature(SPEC_CONSTANTS spec_constants_dummy.cpp) +check_feature(GROUP_ALGORITHMS group_algorithms_dummy.cpp) +check_feature(FP64_SUPPORT fp64_support_dummy.cpp) \ No newline at end of file diff --git a/cmake/has-features/src/fp64_support_dummy.cpp b/cmake/has-features/src/fp64_support_dummy.cpp new file mode 100644 index 00000000..525df029 --- /dev/null +++ b/cmake/has-features/src/fp64_support_dummy.cpp @@ -0,0 +1,15 @@ +#include + +int main() { + sycl::queue q; + sycl::buffer x(1); + + q.submit([&](sycl::handler& cgh) { + sycl::accessor a(x, cgh, sycl::read_write); + cgh.parallel_for(sycl::range<1>(1), [=](sycl::id<1> idx) { a[idx] = 0; }); + }); + + sycl::host_accessor host{x}; + assert(host[0] == 0); + +} \ No newline at end of file diff --git a/cmake/has-features/src/group_algorithms_dummy.cpp b/cmake/has-features/src/group_algorithms_dummy.cpp new file mode 100644 index 00000000..1e8e5308 --- /dev/null +++ b/cmake/has-features/src/group_algorithms_dummy.cpp @@ -0,0 +1,17 @@ +#include +#include + + +int main() { + sycl::queue q; + int* i = sycl::malloc_shared(1, q); + q.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>{{1}, {1}}, [=](sycl::nd_item<1> item) { + // call only the group algorithms used in SYCL-Bench + *i = sycl::reduce_over_group(item.get_group(), 1, sycl::plus{}); + }); + }).wait(); + + assert(*i == 1); + sycl::free(i, q); +} \ No newline at end of file diff --git a/cmake/has-features/src/kernel_reduction_dummy.cpp b/cmake/has-features/src/kernel_reduction_dummy.cpp new file mode 100644 index 00000000..1cce2205 --- /dev/null +++ b/cmake/has-features/src/kernel_reduction_dummy.cpp @@ -0,0 +1,18 @@ +#include + +int main() { + sycl::queue q; + sycl::buffer x(1); + q.submit([&](sycl::handler& cgh) { +#ifdef __ACPP__ + auto r = sycl::reduction(x.template get_access(cgh), sycl::plus{}); +#else + auto r = sycl::reduction(x, cgh, sycl::plus{}); +#endif + + cgh.parallel_for(sycl::range<1>{5}, r, [=](sycl::id<1> idx, auto& op) { op.combine(1); }); + }).wait(); + + sycl::host_accessor host{x}; + assert(host[0] == 5); +} \ No newline at end of file diff --git a/cmake/has-features/src/spec_constants_dummy.cpp b/cmake/has-features/src/spec_constants_dummy.cpp new file mode 100644 index 00000000..36942366 --- /dev/null +++ b/cmake/has-features/src/spec_constants_dummy.cpp @@ -0,0 +1,38 @@ +#include + +#ifndef __ACPP__ + +static constexpr sycl::specialization_id x; + +int main() { + sycl::queue q; + int* i = sycl::malloc_shared(1, q); + q.submit([&](sycl::handler& cgh) { + cgh.set_specialization_constant(5); + cgh.parallel_for(sycl::range(1), [=](sycl::item<1> item, sycl::kernel_handler h) { + *i = h.get_specialization_constant(); + }); + }).wait(); + + assert(*i == 5); + sycl::free(i, q); +} + +#else + +// AdaptiveCpp implements sycl::specialized instead of spec constants + +int main() { + sycl::queue q; + sycl::specialized x; + x = 5; //Requires copy assignment operator + int* i = sycl::malloc_shared(1, q); + q.parallel_for(sycl::range(1), [=](sycl::id<1> idx) { + *i = x; + }).wait(); + + assert(*i == 5); + sycl::free(i, q); +} + +#endif \ No newline at end of file diff --git a/compiletime/compiletime.cpp b/compiletime/compiletime.cpp index d57ab23f..ef8f32e2 100644 --- a/compiletime/compiletime.cpp +++ b/compiletime/compiletime.cpp @@ -1,14 +1,14 @@ // Skeleton for compile time measurements -- doesn't do anything on its own, but should compile successfully -#include +#include -namespace s = cl::sycl; +namespace s = sycl; #include void run(size_t rt_size) { - cl::sycl::queue device_queue; - #include + sycl::queue device_queue; +#include device_queue.wait_and_throw(); } diff --git a/compiletime/compiletime_gen.rb b/compiletime/compiletime_gen.rb index e700ee10..e34c3d95 100644 --- a/compiletime/compiletime_gen.rb +++ b/compiletime/compiletime_gen.rb @@ -11,9 +11,9 @@ # operations available for generation OP_MAPPING = { - "sin" => "OUT = cl::sycl::sin(IN1);", - "cos" => "OUT = cl::sycl::cos(IN1);", - "sqrt" => "OUT = cl::sycl::sqrt(IN1);", + "sin" => "OUT = sycl::sin(IN1);", + "cos" => "OUT = sycl::cos(IN1);", + "sqrt" => "OUT = sycl::sqrt(IN1);", "add" => "OUT = IN1 + IN2;", "mad" => "OUT = IN1 * IN2 + IN1;", } @@ -129,17 +129,17 @@ def parse_cmd(args) f.puts kernel_names.each do |kn| - fwr.call "device_queue.submit([&](cl::sycl::handler& cgh) {" + fwr.call "device_queue.submit([&](sycl::handler& cgh) {" buffers.each do |bn, an| fwr.call "auto #{an} = #{bn}.get_access(cgh);" end - fwr.call "cl::sycl::range<#{options.dimensions}> ndrange{#{ndrange}};" + fwr.call "sycl::range<#{options.dimensions}> ndrange{#{ndrange}};" full_kernel_name = kn full_kernel_name += "<#{options.type}, #{otions.dimensions}>" if options.templated - fwr.call "cgh.parallel_for<#{full_kernel_name}>(ndrange, [=](cl::sycl::id<#{options.dimensions}> gid) {" + fwr.call "cgh.parallel_for<#{full_kernel_name}>(ndrange, [=](sycl::id<#{options.dimensions}> gid) {" fwr.call "#{acc_names[0]}[gid] += #{capture_names.join(" + ")};" # use each capture fwr.call "#{acc_names[0]}[gid] += #{acc_names.join("[gid] + ")}[gid];" # use each buffer diff --git a/include/benchmark_hook.h b/include/benchmark_hook.h index 22db140b..fefa1008 100644 --- a/include/benchmark_hook.h +++ b/include/benchmark_hook.h @@ -3,17 +3,16 @@ #include "result_consumer.h" -class BenchmarkHook -{ +class BenchmarkHook { public: virtual void atInit() = 0; virtual void preSetup() = 0; - virtual void postSetup()= 0; + virtual void postSetup() = 0; virtual void preKernel() = 0; virtual void postKernel() = 0; virtual void emitResults(ResultConsumer&) {} - virtual ~BenchmarkHook(){} + virtual ~BenchmarkHook() {} }; #endif diff --git a/include/benchmark_traits.h b/include/benchmark_traits.h index 7b4b935d..2822b677 100644 --- a/include/benchmark_traits.h +++ b/include/benchmark_traits.h @@ -2,7 +2,7 @@ #include -#include +#include namespace detail { @@ -13,7 +13,7 @@ struct SupportsQueueProfiling { template struct SupportsQueueProfiling().run(std::declval&>()))>> { + std::void_t().run(std::declval&>()))>> { static constexpr bool value = true; }; diff --git a/include/bitmap.h b/include/bitmap.h index ff094d5d..02d282af 100644 --- a/include/bitmap.h +++ b/include/bitmap.h @@ -1,96 +1,93 @@ #ifndef BITMAP_H #define BITMAP_H -#include // float4 definition #include +#include // float4 definition #include using std::string; -void load_bitmap_mirrored(string filename, int size, std::vector &pixels); -void save_bitmap(string filename, int size, const std::vector &buffer); +void load_bitmap_mirrored(string filename, int size, std::vector& pixels); +void save_bitmap(string filename, int size, const std::vector& buffer); /** A single Pixel in the image. A Pixel has red, green, and blue integer components in the range from 0 to 255. **/ -class Pixel -{ +class Pixel { public: int r, g, b; // Initializes a Pixel with a default black color. - Pixel() : r(0), g(0), b(0) { } + Pixel() : r(0), g(0), b(0) {} // Initializes a color Pixel with the specified RGB values. - Pixel(int _r, int _g, int _b) : r(_r), g(_g), b(_b) { } + Pixel(int _r, int _g, int _b) : r(_r), g(_g), b(_b) {} }; -//To abbreviate a pixel matrix built as a vector of vectors -typedef std::vector < std::vector > PixelMatrix; +// To abbreviate a pixel matrix built as a vector of vectors +typedef std::vector> PixelMatrix; /** * Represents a bitmap with pixels in row-major order. * Limitations: Windows BMP, no compression, 24 bit color depth. -**/ -class Bitmap -{ - private: - PixelMatrix pixels; - - public: - /** - * Opens a file as its name is provided and reads pixel-by-pixel the colors - * into a matrix of RGB pixels. Any errors will cout but will result in an - * empty matrix (with no rows and no columns). - * - * @param name of the filename to be opened and read as a matrix of pixels - **/ - void open(std::string); - - /** - * Saves the current image, represented by the matrix of pixels, as a - * Windows BMP file with the name provided by the parameter. File extension - * is not forced but should be .bmp. Any errors will cout and will NOT - * attempt to save the file. - * - * @param name of the filename to be written as a bmp image - **/ - void save(std::string); - - /** - * Validates whether or not the current matrix of pixels represents a - * proper image with non-zero-size rows and consistent non-zero-size - * columns for each row. In addition, each pixel in the matrix is validated - * to have red, green, and blue components with values between 0 and 255 - * - * @return boolean value of whether or not the matrix is a valid image - **/ - bool isImage(); - - /** - * Provides a vector of vector of pixels representing the bitmap - * - * @return the bitmap image, represented by a matrix of RGB pixels - **/ - PixelMatrix toPixelMatrix(); - - /** - * Overwrites the current bitmap with that represented by a matrix of - * pixels. Does not validate that the new matrix of pixels is a proper - * image. - * - * @param a matrix of pixels to represent a bitmap - **/ - void fromPixelMatrix(const PixelMatrix &); - + **/ +class Bitmap { +private: + PixelMatrix pixels; + +public: + /** + * Opens a file as its name is provided and reads pixel-by-pixel the colors + * into a matrix of RGB pixels. Any errors will cout but will result in an + * empty matrix (with no rows and no columns). + * + * @param name of the filename to be opened and read as a matrix of pixels + **/ + void open(std::string); + + /** + * Saves the current image, represented by the matrix of pixels, as a + * Windows BMP file with the name provided by the parameter. File extension + * is not forced but should be .bmp. Any errors will cout and will NOT + * attempt to save the file. + * + * @param name of the filename to be written as a bmp image + **/ + void save(std::string); + + /** + * Validates whether or not the current matrix of pixels represents a + * proper image with non-zero-size rows and consistent non-zero-size + * columns for each row. In addition, each pixel in the matrix is validated + * to have red, green, and blue components with values between 0 and 255 + * + * @return boolean value of whether or not the matrix is a valid image + **/ + bool isImage(); + + /** + * Provides a vector of vector of pixels representing the bitmap + * + * @return the bitmap image, represented by a matrix of RGB pixels + **/ + PixelMatrix toPixelMatrix(); + + /** + * Overwrites the current bitmap with that represented by a matrix of + * pixels. Does not validate that the new matrix of pixels is a proper + * image. + * + * @param a matrix of pixels to represent a bitmap + **/ + void fromPixelMatrix(const PixelMatrix&); }; ////////////////////////////////////////////////////////////////////////////////// -#include #include -//#include "bitmap.h" +#include +// #include "bitmap.h" #include typedef unsigned char uchar_t; @@ -99,27 +96,24 @@ typedef unsigned short int uint16_t; typedef signed int int32_t; typedef signed short int int16_t; -const int MIN_RGB=0; -const int MAX_RGB=255; -const int BMP_MAGIC_ID=2; +const int MIN_RGB = 0; +const int MAX_RGB = 255; +const int BMP_MAGIC_ID = 2; // -------------------------------------------------------------- // Windows BMP-specific format data -struct bmpfile_magic -{ - uchar_t magic[BMP_MAGIC_ID]; +struct bmpfile_magic { + uchar_t magic[BMP_MAGIC_ID]; }; -struct bmpfile_header -{ - uint32_t file_size; - uint16_t creator1; - uint16_t creator2; - uint32_t bmp_offset; +struct bmpfile_header { + uint32_t file_size; + uint16_t creator1; + uint16_t creator2; + uint32_t bmp_offset; }; -struct bmpfile_dib_info -{ +struct bmpfile_dib_info { uint32_t header_size; int32_t width; int32_t height; @@ -141,213 +135,180 @@ struct bmpfile_dib_info * empty matrix (with no rows and no columns). * * @param name of the filename to be opened and read as a matrix of pixels -**/ -void Bitmap::open(std::string filename) -{ - std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary); - //clear data if already holds information - for(int i=0; i row_data; + + for(int col = 0; col < dib_info.width; col++) { + int blue = file.get(); + int green = file.get(); + int red = file.get(); + + row_data.push_back(Pixel(red, green, blue)); } - pixels.clear(); - - if (file.fail()) - { - std::cout< row_data; - - for (int col = 0; col < dib_info.width; col++) - { - int blue = file.get(); - int green = file.get(); - int red = file.get(); - - row_data.push_back( Pixel(red, green, blue) ); - } - - // Rows are padded so that they're always a multiple of 4 - // bytes. This line skips the padding at the end of each row. - file.seekg(dib_info.width % 4, std::ios::cur); - - if (flip) - { - pixels.insert(pixels.begin(), row_data); - } - else - { - pixels.push_back(row_data); - } - } - - file.close(); - }//end else (is an image) - }//end else (can open file) + + // Rows are padded so that they're always a multiple of 4 + // bytes. This line skips the padding at the end of each row. + file.seekg(dib_info.width % 4, std::ios::cur); + + if(flip) { + pixels.insert(pixels.begin(), row_data); + } else { + pixels.push_back(row_data); + } + } + + file.close(); + } // end else (is an image) + } // end else (can open file) } // ---------------------------------------------------------------------------- /** * Saves the current image, represented by the matrix of pixels, as a * Windows BMP file with the name provided by the parameter. File extension - * is not forced but should be .bmp. Any errors will cout and will NOT + * is not forced but should be .bmp. Any errors will cout and will NOT * attempt to save the file. * * @param name of the filename to be written as a bmp image -**/ -void Bitmap::save(std::string filename) -{ - std::ofstream file(filename.c_str(), std::ios::out | std::ios::binary); - - if (file.fail()) - { - std::cout<= 0; row--) - { - const std::vector & row_data = pixels[row]; - - for (int col = 0; col < row_data.size(); col++) - { - const Pixel& pix = row_data[col]; - - file.put((uchar_t)(pix.b)); - file.put((uchar_t)(pix.g)); - file.put((uchar_t)(pix.r)); - } - - // Rows are padded so that they're always a multiple of 4 - // bytes. This line skips the padding at the end of each row. - for (int i = 0; i < row_data.size() % 4; i++) - { - file.put(0); - } - } - - file.close(); - } + **/ +void Bitmap::save(std::string filename) { + std::ofstream file(filename.c_str(), std::ios::out | std::ios::binary); + + if(file.fail()) { + std::cout << filename << " could not be opened for editing. " + << "Is it already open by another program or is it read-only?\n"; + + } else if(!isImage()) { + std::cout << "Bitmap cannot be saved. It is not a valid image.\n"; + } else { + // Write all the header information that the BMP file format requires. + bmpfile_magic magic; + magic.magic[0] = 'B'; + magic.magic[1] = 'M'; + file.write((char*)(&magic), sizeof(magic)); + bmpfile_header header = {0}; + header.bmp_offset = sizeof(bmpfile_magic) + sizeof(bmpfile_header) + sizeof(bmpfile_dib_info); + header.file_size = header.bmp_offset + (pixels.size() * 3 + pixels[0].size() % 4) * pixels.size(); + file.write((char*)(&header), sizeof(header)); + bmpfile_dib_info dib_info = {0}; + dib_info.header_size = sizeof(bmpfile_dib_info); + dib_info.width = pixels[0].size(); + dib_info.height = pixels.size(); + dib_info.num_planes = 1; + dib_info.bits_per_pixel = 24; + dib_info.compression = 0; + dib_info.bmp_byte_size = 0; + dib_info.hres = 2835; + dib_info.vres = 2835; + dib_info.num_colors = 0; + dib_info.num_important_colors = 0; + file.write((char*)(&dib_info), sizeof(dib_info)); + + // Write each row and column of Pixels into the image file -- we write + // the rows upside-down to satisfy the easiest BMP format. + for(int row = pixels.size() - 1; row >= 0; row--) { + const std::vector& row_data = pixels[row]; + + for(int col = 0; col < row_data.size(); col++) { + const Pixel& pix = row_data[col]; + + file.put((uchar_t)(pix.b)); + file.put((uchar_t)(pix.g)); + file.put((uchar_t)(pix.r)); + } + + // Rows are padded so that they're always a multiple of 4 + // bytes. This line skips the padding at the end of each row. + for(int i = 0; i < row_data.size() % 4; i++) { + file.put(0); + } + } + + file.close(); + } } - + // ---------------------------------------------------------------------------- /** - * Validates whether or not the current matrix of pixels represents a - * proper image with non-zero-size rows and consistent non-zero-size - * columns for each row. In addition, each pixel in the matrix is validated - * to have red, green, and blue components with values between 0 and 255 - * - * @return boolean value of whether or not the matrix is a valid image + * Validates whether or not the current matrix of pixels represents a + * proper image with non-zero-size rows and consistent non-zero-size + * columns for each row. In addition, each pixel in the matrix is validated + * to have red, green, and blue components with values between 0 and 255 + * + * @return boolean value of whether or not the matrix is a valid image **/ -bool Bitmap::isImage() -{ - const int height = pixels.size(); - - if( height == 0 || pixels[0].size() == 0) - { - return false; - } - - const int width = pixels[0].size(); - - for(int row=0; row < height; row++) - { - if( pixels[row].size() != width ) - { - return false; - } - for(int column=0; column < width; column++) - { - Pixel current = pixels[row][column]; - if( current.r > MAX_RGB || current.r < MIN_RGB || - current.g > MAX_RGB || current.g < MIN_RGB || - current.b > MAX_RGB || current.b < MIN_RGB ) - return false; - } - } - return true; +bool Bitmap::isImage() { + const int height = pixels.size(); + + if(height == 0 || pixels[0].size() == 0) { + return false; + } + + const int width = pixels[0].size(); + + for(int row = 0; row < height; row++) { + if(pixels[row].size() != width) { + return false; + } + for(int column = 0; column < width; column++) { + Pixel current = pixels[row][column]; + if(current.r > MAX_RGB || current.r < MIN_RGB || current.g > MAX_RGB || current.g < MIN_RGB || + current.b > MAX_RGB || current.b < MIN_RGB) + return false; + } + } + return true; } // ---------------------------------------------------------------------------- @@ -355,17 +316,13 @@ bool Bitmap::isImage() * Provides a vector of vector of pixels representing the bitmap * * @return the bitmap image, represented by a matrix of RGB pixels -**/ -PixelMatrix Bitmap::toPixelMatrix() -{ - if( isImage() ) - { - return pixels; - } - else - { - return PixelMatrix(); - } + **/ +PixelMatrix Bitmap::toPixelMatrix() { + if(isImage()) { + return pixels; + } else { + return PixelMatrix(); + } } // ---------------------------------------------------------------------------- @@ -375,60 +332,54 @@ PixelMatrix Bitmap::toPixelMatrix() * image. * * @param a matrix of pixels to represent a bitmap -**/ -void Bitmap::fromPixelMatrix(const PixelMatrix & values) -{ - pixels = values; -} + **/ +void Bitmap::fromPixelMatrix(const PixelMatrix& values) { pixels = values; } #endif -void load_bitmap_mirrored(string filename, int size, std::vector &input){ +void load_bitmap_mirrored(string filename, int size, std::vector& input) { Bitmap input_image; input_image.open(filename); - //std::cout << "input image " << filename << " loaded" << std::endl; + // std::cout << "input image " << filename << " loaded" << std::endl; PixelMatrix pixels = input_image.toPixelMatrix(); int w = pixels.size(); int h; - if(w>0) + if(w > 0) h = pixels[0].size(); else h = 0; // prepare the input buffer (similar to a GL_MIRRORED_REPEAT of the input picture) input.resize(size * size); - for(size_t i=0; i [" << size << "x" << size << "]" << std::endl; + // std::cout << "image resized to match the input size: "; + // std::cout << "[" << w << "x" << h << "] => [" << size << "x" << size << "]" << std::endl; } -void save_bitmap(string filename, int size, const std::vector &output){ - // write the output picture - //std::cout << "saving the output picture in " << filename << std::endl; - Bitmap output_image; - PixelMatrix pixels; - pixels.resize(size); -//std::cout << "debug " << size << " - " << output.size() << std::endl; - for(size_t i=0; i& output) { + // write the output picture + // std::cout << "saving the output picture in " << filename << std::endl; + Bitmap output_image; + PixelMatrix pixels; + pixels.resize(size); + // std::cout << "debug " << size << " - " << output.size() << std::endl; + for(size_t i = 0; i < size; i++) { + pixels[i].resize(size); + for(size_t j = 0; j < size; j++) { + sycl::float4 color = output[i * size + j] * 255.f; + // std::cout << color.x() << "," << color.z() << "/"; + pixels[i][j].r = (int)color.x(); + pixels[i][j].g = (int)color.y(); + pixels[i][j].b = (int)color.z(); } - output_image.fromPixelMatrix(pixels); - output_image.save(filename); + } + output_image.fromPixelMatrix(pixels); + output_image.save(filename); } - - - diff --git a/include/command_line.h b/include/command_line.h index 1fb73f93..852fbfea 100644 --- a/include/command_line.h +++ b/include/command_line.h @@ -1,50 +1,48 @@ #ifndef BENCHMARK_COMMAND_LINE_HPP #define BENCHMARK_COMMAND_LINE_HPP +#include "common.h" + +#include "result_consumer.h" +#include +#include +#include +#include #include +#include #include #include -#include #include -#include -#include -#include -#include -#include "result_consumer.h" using CommandLineArguments = std::unordered_map; using FlagList = std::unordered_set; namespace detail { -template -inline T simple_cast(const std::string& s) -{ +template +inline T simple_cast(const std::string& s) { std::stringstream sstr{s}; T result; sstr >> result; return result; } -template -inline std::vector parseCommaDelimitedList(const std::string& s) -{ +template +inline std::vector parseCommaDelimitedList(const std::string& s) { std::stringstream istr(s); std::string current; std::vector result; - while(std::getline(istr, current, ',')) - result.push_back(simple_cast(current)); - + while(std::getline(istr, current, ',')) result.push_back(simple_cast(current)); + return result; } -template -inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultValue) -{ +template +inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultValue) { auto elements = parseCommaDelimitedList(s); if(s.size() > 3) - throw std::invalid_argument{"Invalid sycl range/id: "+s}; + throw std::invalid_argument{"Invalid sycl range/id: " + s}; else if(s.size() == 3) return SyclArraylike{elements[0], elements[1], elements[2]}; else if(s.size() == 2) @@ -52,116 +50,88 @@ inline SyclArraylike parseSyclArray(const std::string& s, std::size_t defaultVal else if(s.size() == 1) return SyclArraylike{elements[0], defaultValue, defaultValue}; else - throw std::invalid_argument{"Invalid sycl range/id: "+s}; + throw std::invalid_argument{"Invalid sycl range/id: " + s}; } -} +} // namespace detail -template -inline T cast(const std::string& s) -{ +template +inline T cast(const std::string& s) { return detail::simple_cast(s); } -template<> -inline cl::sycl::range<3> -cast(const std::string& s) -{ - return detail::parseSyclArray>(s, 1); +template <> +inline sycl::range<3> cast(const std::string& s) { + return detail::parseSyclArray>(s, 1); } -template<> -inline cl::sycl::id<3> -cast(const std::string& s) -{ - return detail::parseSyclArray>(s, 0); +template <> +inline sycl::id<3> cast(const std::string& s) { + return detail::parseSyclArray>(s, 0); } -class CommandLine -{ +class CommandLine { public: CommandLine() = default; - CommandLine(int argc, char** argv) - { - for (int i = 0; i < argc; ++i) - { + CommandLine(int argc, char** argv) { + for(int i = 0; i < argc; ++i) { std::string arg = argv[i]; auto pos = arg.find("="); - if(pos != std::string::npos) - { - auto argName = arg.substr(0,pos); - auto argVal = arg.substr(pos+1); - - if(args.find(argName) != args.end()) - { - throw std::invalid_argument{ - "Encountered command line argument several times: " + argName}; + if(pos != std::string::npos) { + auto argName = arg.substr(0, pos); + auto argVal = arg.substr(pos + 1); + + if(args.find(argName) != args.end()) { + throw std::invalid_argument{"Encountered command line argument several times: " + argName}; } args[argName] = argVal; - } - else - { + } else { flags.insert(arg); } } } - bool isArgSet(const std::string& arg) const - { - return args.find(arg) != args.end(); - } + bool isArgSet(const std::string& arg) const { return args.find(arg) != args.end(); } - template - T getOrDefault(const std::string& arg, const T& defaultVal) const - { + template + T getOrDefault(const std::string& arg, const T& defaultVal) const { if(isArgSet(arg)) return cast(args.at(arg)); return defaultVal; } - template - T get(const std::string& arg) const - { - try - { + template + T get(const std::string& arg) const { + try { return cast(args.at(arg)); - } - catch(std::out_of_range& e) - { - throw std::invalid_argument{"Command line argument was requested but missing: "+arg}; + } catch(std::out_of_range& e) { + throw std::invalid_argument{"Command line argument was requested but missing: " + arg}; } } - bool isFlagSet(const std::string& flag) const - { - return flags.find(flag) != flags.end(); - } + bool isFlagSet(const std::string& flag) const { return flags.find(flag) != flags.end(); } - private: - - CommandLineArguments args; FlagList flags; }; -struct VerificationSetting -{ +struct VerificationSetting { bool enabled; - cl::sycl::id<3> begin = {0, 0, 0}; - cl::sycl::range<3> range = {1, 1, 1}; + sycl::id<3> begin = {0, 0, 0}; + sycl::range<3> range = {1, 1, 1}; }; -struct BenchmarkArgs -{ +struct BenchmarkArgs { size_t problem_size; size_t local_size; size_t num_runs; - cl::sycl::queue device_queue; + sycl::queue device_queue; + sycl::queue device_queue_in_order; VerificationSetting verification; // can be used to query additional benchmark specific information from the command line CommandLine cli; @@ -169,68 +139,38 @@ struct BenchmarkArgs bool warmup_run; }; -class CUDASelector : public cl::sycl::device_selector { -public: - int operator()(const cl::sycl::device& device) const override { - using namespace cl::sycl::info; - const std::string driverVersion = device.get_info(); - if(device.is_gpu() && (driverVersion.find("CUDA") != std::string::npos)) { - return 1; - }; - return -1; - } -}; -class BenchmarkCommandLine -{ +class BenchmarkCommandLine { public: - BenchmarkCommandLine(int argc, char **argv) - : cli_parser{argc, argv} {} + BenchmarkCommandLine(int argc, char** argv) : cli_parser{argc, argv} {} - BenchmarkArgs getBenchmarkArgs() const - { + BenchmarkArgs getBenchmarkArgs() const { std::size_t size = cli_parser.getOrDefault("--size", 3072); std::size_t local_size = cli_parser.getOrDefault("--local", 256); std::size_t num_runs = cli_parser.getOrDefault("--num-runs", 5); std::string device_type = cli_parser.getOrDefault("--device", "default"); - bool warmup_run = cli_parser.isFlagSet("--warmup-run"); - if (warmup_run) { - // Make drop of first run transparent to the user - ++num_runs; - } - cl::sycl::queue q = getQueue(device_type); + sycl::queue q = getQueue(device_type); + sycl::queue q_in_order = getQueue(device_type, sycl::property::queue::in_order{}); bool verification_enabled = true; if(cli_parser.isFlagSet("--no-verification")) verification_enabled = false; - auto verification_begin = cli_parser.getOrDefault>( - "--verification-begin", cl::sycl::id<3>{0,0,0}); - - auto verification_range = cli_parser.getOrDefault>( - "--verification-range", cl::sycl::range<3>{1,1,1}); - - auto result_consumer = getResultConsumer( - cli_parser.getOrDefault("--output","stdio")); - - return BenchmarkArgs{size, - local_size, - num_runs, - q, - VerificationSetting{verification_enabled, - verification_begin, - verification_range}, - cli_parser, - result_consumer, - warmup_run}; + auto verification_begin = cli_parser.getOrDefault>("--verification-begin", sycl::id<3>{0, 0, 0}); + + auto verification_range = cli_parser.getOrDefault>("--verification-range", sycl::range<3>{1, 1, 1}); + + auto result_consumer = getResultConsumer(cli_parser.getOrDefault("--output", "stdio")); + + return BenchmarkArgs{size, local_size, num_runs, q, q_in_order, + VerificationSetting{verification_enabled, verification_begin, verification_range}, cli_parser, result_consumer}; } private: std::shared_ptr - getResultConsumer(const std::string& result_consumer_name) const - { + getResultConsumer(const std::string& result_consumer_name) const { if(result_consumer_name == "stdio") return std::shared_ptr{new OstreamResultConsumer{std::cout}}; else @@ -239,27 +179,23 @@ class BenchmarkCommandLine return std::shared_ptr{new AppendingCsvResultConsumer{result_consumer_name}}; } - cl::sycl::queue getQueue(const std::string& device_type) const { - const auto getQueueProperties = [&]() -> cl::sycl::property_list { + template + sycl::queue getQueue(const std::string& device_type, Props&&... props) const { + const auto getQueueProperties = [&]() -> sycl::property_list { + #if defined(SYCL_BENCH_ENABLE_QUEUE_PROFILING) - return cl::sycl::property::queue::enable_profiling{}; + return {sycl::property::queue::enable_profiling{}, props...}; #endif - return {}; + return {props...}; }; -#if defined(__LLVM_SYCL_CUDA__) - if(device_type != "gpu") { - throw std::invalid_argument{"Only the 'gpu' device is supported on LLVM CUDA"}; - } - return cl::sycl::queue{CUDASelector{}, getQueueProperties()}; -#endif if(device_type == "cpu") { - return cl::sycl::queue{cl::sycl::cpu_selector{}, getQueueProperties()}; + return sycl::queue{sycl::cpu_selector_v, getQueueProperties()}; } else if(device_type == "gpu") { - return cl::sycl::queue{cl::sycl::gpu_selector{}, getQueueProperties()}; + return sycl::queue{sycl::gpu_selector_v, getQueueProperties()}; } else if(device_type == "default") { - return cl::sycl::queue{getQueueProperties()}; + return sycl::queue{getQueueProperties()}; } else { throw std::invalid_argument{"unknown device type: " + device_type}; } @@ -269,4 +205,3 @@ class BenchmarkCommandLine }; #endif - diff --git a/include/common.h b/include/common.h index e5e5e6d4..e3c65380 100644 --- a/include/common.h +++ b/include/common.h @@ -1,57 +1,47 @@ -#pragma once -#include +#pragma once +#include -#include -#include +#include // for std::min #include -#include +#include #include -#include // for std::min +#include +#include +#include #include #include -#include #include "command_line.h" #include "result_consumer.h" #include "type_traits.h" - + #include "benchmark_hook.h" #include "benchmark_traits.h" -#include "prefetched_buffer.h" +#include "memory_wrappers.h" #include "time_metrics.h" -#ifdef NV_ENERGY_MEAS - #include "nv_energy_meas.h" +#ifdef NV_ENERGY_MEAS +#include "nv_energy_meas.h" #endif - -template -class BenchmarkManager -{ +template +class BenchmarkManager { public: - BenchmarkManager(const BenchmarkArgs &_args) : args(_args) {} + BenchmarkManager(const BenchmarkArgs& _args) : args(_args) {} - void addHook(BenchmarkHook &h) - { - hooks.push_back(&h); - } + void addHook(BenchmarkHook& h) { hooks.push_back(&h); } - template - void run(Args&&... additionalArgs) - { + template + void run(Args&&... additionalArgs) { args.result_consumer->proceedToBenchmark(Benchmark{args, additionalArgs...}.getBenchmarkName(args)); + args.result_consumer->consumeResult("problem-size", std::to_string(args.problem_size)); + args.result_consumer->consumeResult("local-size", std::to_string(args.local_size)); args.result_consumer->consumeResult( - "problem-size", std::to_string(args.problem_size)); - args.result_consumer->consumeResult( - "local-size", std::to_string(args.local_size)); - args.result_consumer->consumeResult( - "device-name", args.device_queue.get_device() - .template get_info()); - args.result_consumer->consumeResult( - "sycl-implementation", this->getSyclImplementation()); + "device-name", args.device_queue.get_device().get_info()); + args.result_consumer->consumeResult("sycl-implementation", this->getSyclImplementation()); TimeMetricsProcessor time_metrics(args); @@ -71,7 +61,7 @@ class BenchmarkManager args.device_queue.wait_and_throw(); for(auto h : hooks) h->postSetup(); - std::vector run_events; + std::vector run_events; run_events.reserve(1024); // Make sure we don't need to resize during benchmarking. // Performance critical measurement section starts here @@ -87,23 +77,37 @@ class BenchmarkManager for(auto h : hooks) h->postKernel(); // Performance critical measurement section ends here - time_metrics.addTimingResult("run-time", std::chrono::duration_cast(after - before)); + auto run_time = std::chrono::duration_cast(after - before); + time_metrics.addTimingResult("run-time", run_time); if(detail::BenchmarkTraits::supportsQueueProfiling) { #if defined(SYCL_BENCH_ENABLE_QUEUE_PROFILING) // TODO: We might also want to consider the "command_submit" time. std::chrono::nanoseconds total_time{0}; + std::chrono::nanoseconds submit_time{0}; + // Runtime without kernel time + std::chrono::nanoseconds system_time{0}; for(auto& e : run_events) { - const auto start = e.get_profiling_info(); - const auto end = e.get_profiling_info(); + const auto start = e.get_profiling_info(); + const auto end = e.get_profiling_info(); + const auto submit = e.get_profiling_info(); total_time += std::chrono::nanoseconds(end - start); + submit_time += std::chrono::nanoseconds(start - submit); } + system_time += std::chrono::nanoseconds(run_time - total_time); + time_metrics.addTimingResult("kernel-time", total_time); + time_metrics.addTimingResult("submit-time", submit_time); + time_metrics.addTimingResult("system-time", system_time); #else time_metrics.markAsUnavailable("kernel-time"); + time_metrics.markAsUnavailable("submit-time"); + time_metrics.markAsUnavailable("system-time"); #endif } else { time_metrics.markAsUnavailable("kernel-time"); + time_metrics.markAsUnavailable("submit-time"); + time_metrics.markAsUnavailable("system-time"); } if constexpr(detail::BenchmarkTraits::hasVerify) { @@ -123,7 +127,7 @@ class BenchmarkManager time_metrics.emitResults(*args.result_consumer); - for (auto h : hooks) { + for(auto h : hooks) { // Extract results from the hooks h->emitResults(*args.result_consumer); } @@ -131,33 +135,26 @@ class BenchmarkManager if(args.verification.range.size() == 0 || !args.verification.enabled || !detail::BenchmarkTraits::hasVerify) { args.result_consumer->consumeResult("Verification", "N/A"); - } - else if(!all_runs_pass){ + } else if(!all_runs_pass) { // error args.result_consumer->consumeResult("Verification", "FAIL"); - } - else { + } else { // pass args.result_consumer->consumeResult("Verification", "PASS"); - } - + } + args.result_consumer->flush(); - } private: - BenchmarkArgs args; + BenchmarkArgs args; std::vector hooks; std::string getSyclImplementation() const { -#if defined(__HIPSYCL__) - return "hipSYCL"; -#elif defined(__COMPUTECPP__) - return "ComputeCpp"; -#elif defined(__LLVM_SYCL__) +#if defined(__ACPP__) + return "AdaptiveCpp"; +#elif defined(__DPCPP__) return "LLVM (Intel DPC++)"; -#elif defined(__LLVM_SYCL_CUDA__) - return "LLVM CUDA (Codeplay)"; #elif defined(__TRISYCL__) return "triSYCL"; #else @@ -167,38 +164,30 @@ class BenchmarkManager }; -class BenchmarkApp -{ - BenchmarkArgs args; - cl::sycl::queue device_queue; +class BenchmarkApp { + BenchmarkArgs args; + sycl::queue device_queue; std::unordered_set benchmark_names; - -public: - BenchmarkApp(int argc, char** argv) - { - try{ + +public: + BenchmarkApp(int argc, char** argv) { + try { args = BenchmarkCommandLine{argc, argv}.getBenchmarkArgs(); - } - catch(std::exception& e){ + } catch(std::exception& e) { std::cerr << "Error while parsing command lines: " << e.what() << std::endl; } } - const BenchmarkArgs& getArgs() const - { return args; } + const BenchmarkArgs& getArgs() const { return args; } - bool shouldRunNDRangeKernels() const - { - return !args.cli.isFlagSet("--no-ndrange-kernels"); - } + bool shouldRunNDRangeKernels() const { return !args.cli.isFlagSet("--no-ndrange-kernels"); } - bool deviceHasAspect(cl::sycl::aspect asp) const { return device_queue.get_device().has(asp); } + bool deviceHasAspect(sycl::aspect asp) const { return device_queue.get_device().has(asp); } - bool deviceSupportsFP64() const { return deviceHasAspect(cl::sycl::aspect::fp64); } + bool deviceSupportsFP64() const { return deviceHasAspect(sycl::aspect::fp64); } - template - void run(AdditionalArgs&&... additional_args) - { + template + void run(AdditionalArgs&&... additional_args) { try { const auto name = Benchmark{args, additional_args...}.getBenchmarkName(args); if(benchmark_names.count(name) == 0) { @@ -216,11 +205,9 @@ class BenchmarkApp #endif mgr.run(additional_args...); - } - catch(cl::sycl::exception& e){ + } catch(sycl::exception& e) { std::cerr << "SYCL error: " << e.what() << std::endl; - } - catch(std::exception& e){ + } catch(std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; } } diff --git a/include/memory_wrappers.h b/include/memory_wrappers.h new file mode 100644 index 00000000..f895c282 --- /dev/null +++ b/include/memory_wrappers.h @@ -0,0 +1,296 @@ +#pragma once +#include "common.h" +#include + +#include "utils.h" + + +template +class InitializationDummyKernel { +public: + InitializationDummyKernel(AccType acc) : acc{acc} {} + + void operator()() const {} + +private: + AccType acc; +}; + +class InitializationDummyKernel2; + +template +inline void forceDataTransfer(sycl::queue& q, BufferType b) { + q.submit([&](sycl::handler& cgh) { + auto acc = b.template get_access(cgh); + cgh.single_task(InitializationDummyKernel{acc}); + }); + q.wait_and_throw(); +} + +template +inline void forceDataAllocation(sycl::queue& q, BufferType b) { + q.submit([&](sycl::handler& cgh) { + auto acc = b.template get_access(cgh); + cgh.single_task(InitializationDummyKernel{acc}); + }); + q.wait_and_throw(); +} + +template +class PrefetchedBuffer { +public: + void initialize(sycl::queue& q, sycl::range r) { + buff = std::make_shared>(r); + forceDataAllocation(q, *buff); + } + + void initialize(sycl::queue& q, T* data, sycl::range r) { + buff = std::make_shared>(data, r); + buff->set_write_back(false); + forceDataTransfer(q, *buff); + } + + void initialize(sycl::queue& q, const T* data, sycl::range r) { + buff = std::make_shared>(data, r); + buff->set_write_back(false); + forceDataTransfer(q, *buff); + } + + + template + auto get_access(sycl::handler& commandGroupHandler) { + return buff->template get_access(commandGroupHandler); + } + + template + auto get_access() { + return buff->template get_access(); + } + + template + auto get_access( + sycl::handler& commandGroupHandler, sycl::range accessRange, sycl::id accessOffset = {}) { + return buff->template get_access(commandGroupHandler, accessRange, accessOffset); + } + + template + auto get_access(sycl::range accessRange, sycl::id accessOffset = {}) { + return buff->template get_access(accessRange, accessOffset); + } + + auto get_host_access() { return buff->template get_host_access(); } + + sycl::range get_range() const { return buff->get_range(); } + + sycl::buffer& get() const { return *buff; } + + void reset() { buff = nullptr; } + +private: + // Wrap in a shared_ptr to allow default constructing this class + std::shared_ptr> buff; +}; + + +namespace detail { +template +struct has_dim_impl { + static constexpr bool value = val == expected; +}; + +template +static constexpr bool has_dim_v = has_dim_impl::value; + +template +using has_dim_t = std::enable_if_t == true, void>; + +template +struct usm_properties; + +using namespace sycl::usm; +template <> +struct usm_properties { + static constexpr bool is_device_accessible = true; + static constexpr bool is_host_accessible = false; +}; +template <> +struct usm_properties { + static constexpr bool is_device_accessible = true; + static constexpr bool is_host_accessible = true; +}; +template <> +struct usm_properties { + static constexpr bool is_device_accessible = true; + static constexpr bool is_host_accessible = true; +}; + + +} // namespace detail + + +template +class USMBuffer { + static_assert(dim >= 1 && dim <= 3, "Invalid dim provided"); + +protected: + T* _data; + T* _host_ptr; + sycl::range _count; + std::size_t total_size; + sycl::queue* queue; + +public: + USMBuffer() : _data(nullptr), _host_ptr(nullptr), _count(getRange()), total_size(0), queue(nullptr) {} + + ~USMBuffer() { + if(_data != nullptr) { + sycl::free(_data, *queue); + } + if constexpr(!detail::usm_properties::is_host_accessible) { + if(_host_ptr != nullptr) { + sycl::free(_host_ptr, *queue); + } + } + } + + template > + void initialize(sycl::queue& q, size_t count) { + queue = &q; + allocate(count); + } + + void initialize(sycl::queue& q, sycl::range count) { + queue = &q; + allocate(count); + } + + void initialize(const T* data, size_t count) { + allocate(queue, count); + copy(queue, data, _data, count); + } + + void initialize(const T* data, sycl::range count) { + allocate(count); + copy(data, _data, count); + } + + + void update_host() { + if constexpr(!detail::usm_properties::is_host_accessible) { + queue->copy(_data, _host_ptr, total_size); + queue->wait_and_throw(); + } + } + + sycl::event update_host(sycl::event event) { + if constexpr(!detail::usm_properties::is_host_accessible) { + return queue->copy(_data, _host_ptr, total_size, event); + } else + return event; + } + + sycl::event update_device() { + if constexpr(detail::usm_properties::is_device_accessible && + !detail::usm_properties::is_host_accessible) { + assert(_host_ptr != nullptr && "calling update_device when no modification has been made on the host"); + // auto event = queue.copy(_host_ptr, _data, total_size); + // queue.wait_and_throw(); + return queue->copy(_host_ptr, _data, total_size); + } else + return sycl::event{}; + } + + sycl::event update_device(sycl::event event) { + if constexpr(detail::usm_properties::is_device_accessible && + !detail::usm_properties::is_host_accessible) { + assert(_host_ptr != nullptr && "calling update_device when no modification has been made on the host"); + return queue->copy(_host_ptr, _data, total_size, event); + } else + return event; + } + + T* get() const { return _data; } + + T* get_host_ptr() const { + assert(_host_ptr != nullptr && "_host_ptr not initialized. You should first call update_host()"); + return _host_ptr; + } + + T* update_and_get_host_ptr() { + update_host(); + return _host_ptr; + } + + std::tuple update_and_get_host_ptr(sycl::event event) { + auto new_event = update_host(event); + return {_host_ptr, new_event}; + } + + + auto size() const { return total_size; } + +private: + template + T* malloc(size_t count) { + return static_cast(sycl::malloc(count * sizeof(T), *queue, alloc_type)); + } + + auto constexpr getRange() { + if constexpr(dim == 1) { + return sycl::range(0); + } + if constexpr(dim == 2) { + return sycl::range(0, 0); + } + if constexpr(dim == 3) { + return sycl::range(0, 0, 0); + } + } + + std::size_t inline getSize(const sycl::range& count) { + std::size_t total_size = 0; + loop([&](std::size_t val) { total_size += count[val]; }); + return total_size; + } + + template > + void allocate(size_t count) { + assert(count >= 0 && "Cannot allocate negative num bytes"); + _data = malloc(count); + if constexpr(!detail::usm_properties::is_host_accessible) { + _host_ptr = static_cast(sycl::malloc_host(count * sizeof(T), *queue)); + } else { + _host_ptr = _data; + } + this->_count = sycl::range{count}; + total_size = count; + } + + void allocate(const sycl::range& count) { + loop([&](std::size_t idx) { assert(count[idx] >= 0 && "Cannot allocate negative num bytes"); }); + + const size_t total_size = getSize(count); + _data = malloc(total_size); + if constexpr(!detail::usm_properties::is_host_accessible) { + _host_ptr = static_cast(sycl::malloc_host(total_size * sizeof(T), *queue)); + } else { + _host_ptr = _data; + } + + this->_count = count; + this->total_size = total_size; + } + + void copy(const T* src, T* dst, std::size_t count) const { + // assert(count <= _count[0] && "Cannot copy negative num bytes"); + // assert(_data != nullptr && "Called copy on initialized USM buffer"); + queue->copy(src, dst, count).wait_and_throw(); + } + + void copy(const T* src, T* dst, sycl::range count) const { + loop([&](std::size_t idx) { assert(count[idx] >= 0 && "Cannot copy negative num bytes"); }); + + const size_t total_size = getSize(count); + queue->copy(src, dst, count).wait_and_throw(); + } +}; \ No newline at end of file diff --git a/include/nv_energy_meas.h b/include/nv_energy_meas.h index cda24f81..71f55038 100644 --- a/include/nv_energy_meas.h +++ b/include/nv_energy_meas.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once // NVML energy measurmeent // TODO \ No newline at end of file diff --git a/include/prefetched_buffer.h b/include/prefetched_buffer.h deleted file mode 100644 index 251ac592..00000000 --- a/include/prefetched_buffer.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once -#include -#include - -template -class InitializationDummyKernel -{ -public: - InitializationDummyKernel(AccType acc) - : acc{acc} {} - - void operator()() const {} -private: - AccType acc; -}; - -class InitializationDummyKernel2; - -template -inline void forceDataTransfer(cl::sycl::queue& q, BufferType b) { - q.submit([&](cl::sycl::handler& cgh) { - auto acc = b.template get_access(cgh); - cgh.single_task(InitializationDummyKernel{acc}); - }); - q.wait_and_throw(); -} - -template -inline void forceDataAllocation(cl::sycl::queue& q, BufferType b) { - q.submit([&](cl::sycl::handler& cgh) { - auto acc = b.template get_access(cgh); - cgh.single_task(InitializationDummyKernel{acc}); - }); - q.wait_and_throw(); -} - -template -class PrefetchedBuffer { -public: - void initialize(cl::sycl::queue& q, cl::sycl::range r) { - buff = std::make_shared>(r); - forceDataAllocation(q, *buff); - } - - void initialize(cl::sycl::queue& q, T* data, cl::sycl::range r) { - buff = std::make_shared>(data, r); - forceDataTransfer(q, *buff); - } - - void initialize(cl::sycl::queue& q, const T* data, cl::sycl::range r) { - buff = std::make_shared>(data, r); - forceDataTransfer(q, *buff); - } - - - template - auto get_access(cl::sycl::handler& commandGroupHandler) { - return buff->template get_access(commandGroupHandler); - } - - template - auto get_access() { - return buff->template get_access(); - } - - template - auto get_access(cl::sycl::handler& commandGroupHandler, cl::sycl::range accessRange, - cl::sycl::id accessOffset = {}) { - return buff->template get_access(commandGroupHandler, accessRange, accessOffset); - } - - template - auto get_access(cl::sycl::range accessRange, cl::sycl::id accessOffset = {}) { - return buff->template get_access(accessRange, accessOffset); - } - - cl::sycl::range get_range() const - { - return buff->get_range(); - } - - cl::sycl::buffer& get() const { return *buff; } - - void reset() { buff = nullptr; } - -private: - // Wrap in a shared_ptr to allow default constructing this class - std::shared_ptr> buff; -}; diff --git a/include/result_consumer.h b/include/result_consumer.h index 24715f4e..77b2e8a2 100644 --- a/include/result_consumer.h +++ b/include/result_consumer.h @@ -1,22 +1,20 @@ #ifndef RESULT_CONSUMER_HPP #define RESULT_CONSUMER_HPP +#include #include #include #include #include #include #include -#include -class ResultConsumer -{ +class ResultConsumer { public: virtual void proceedToBenchmark(const std::string& name) = 0; // Register a result in the result consumer - virtual void consumeResult(const std::string& result_name, - const std::string& result, - const std::string& unit = "") = 0; + virtual void consumeResult( + const std::string& result_name, const std::string& result, const std::string& unit = "") = 0; // Guarantees that the results have been emitted to the output // as specified by the ResultConsumer implementation @@ -25,32 +23,23 @@ class ResultConsumer // Discards the current benchmark's results, useful e.g. in case of errors. virtual void discard() {} - virtual ~ResultConsumer(){} - + virtual ~ResultConsumer() {} }; -class OstreamResultConsumer : public ResultConsumer -{ +class OstreamResultConsumer : public ResultConsumer { std::ostream& output; std::string name; public: - OstreamResultConsumer(std::ostream& ostr) - : output{ostr} - {} + OstreamResultConsumer(std::ostream& ostr) : output{ostr} {} - virtual void proceedToBenchmark(const std::string& benchmark_name) override - { + virtual void proceedToBenchmark(const std::string& benchmark_name) override { name = benchmark_name; - output << "********** Results for " << name - << "**********" << std::endl; - + output << "********** Results for " << name << "**********" << std::endl; } - virtual void consumeResult(const std::string& result_name, - const std::string& result, - const std::string& unit = "") override - { + virtual void consumeResult( + const std::string& result_name, const std::string& result, const std::string& unit = "") override { output << result_name << ": " << result; if(!unit.empty()) { output << " [" << unit << "]"; @@ -58,63 +47,48 @@ class OstreamResultConsumer : public ResultConsumer output << std::endl; } - virtual void flush() override - { - } + virtual void flush() override {} }; // TODO ResultConsumer that appends to a csv -class AppendingCsvResultConsumer : public ResultConsumer -{ +class AppendingCsvResultConsumer : public ResultConsumer { public: using benchmark_data = std::unordered_map; - AppendingCsvResultConsumer(const std::string& filename) - : output{filename, std::ios::app} - {} + AppendingCsvResultConsumer(const std::string& filename) : output{filename, std::ios::app} {} - virtual void proceedToBenchmark(const std::string& benchmark_name) override - { - currentBenchmark = benchmark_name; - } + virtual void proceedToBenchmark(const std::string& benchmark_name) override { currentBenchmark = benchmark_name; } - virtual void consumeResult(const std::string& result_name, - const std::string& result, - const std::string& unit = "") override - { + virtual void consumeResult( + const std::string& result_name, const std::string& result, const std::string& unit = "") override { data[currentBenchmark][result_name] = result; } - virtual void flush() override - { + virtual void flush() override { std::unordered_set columns; - for(const auto& benchmark: data) { + for(const auto& benchmark : data) { for(auto entry : benchmark.second) { columns.insert(entry.first); } } std::vector sorted_columns; - for(auto c : columns) - sorted_columns.push_back(c); + for(auto c : columns) sorted_columns.push_back(c); // To make sure order of columns is deterministic - std::sort(sorted_columns.begin(),sorted_columns.end()); + std::sort(sorted_columns.begin(), sorted_columns.end()); output << "# Benchmark name"; - for(auto c : sorted_columns) - output << "," << c; + for(auto c : sorted_columns) output << "," << c; output << std::endl; for(const auto& benchmark : data) { output << benchmark.first; - for(auto c : sorted_columns) - output << "," << benchmark.second.at(c); + for(auto c : sorted_columns) output << "," << benchmark.second.at(c); output << std::endl; } data.clear(); - } void discard() override { @@ -132,4 +106,3 @@ class AppendingCsvResultConsumer : public ResultConsumer }; #endif - diff --git a/include/type_traits.h b/include/type_traits.h index cdb4f1b9..f374707b 100644 --- a/include/type_traits.h +++ b/include/type_traits.h @@ -1,14 +1,15 @@ #ifndef TYPE_TRAITS_H #define TYPE_TRAITS_H -template -struct ReadableTypename -{}; +template +struct ReadableTypename {}; -#define MAKE_READABLE_TYPENAME(T, str) \ -template<> \ -struct ReadableTypename \ -{ static const char* name; }; const char* ReadableTypename::name = str; +#define MAKE_READABLE_TYPENAME(T, str) \ + template <> \ + struct ReadableTypename { \ + static const char* name; \ + }; \ + const char* ReadableTypename::name = str; MAKE_READABLE_TYPENAME(char, "int8") MAKE_READABLE_TYPENAME(unsigned char, "uint8") diff --git a/include/utils.h b/include/utils.h new file mode 100644 index 00000000..6dd3586d --- /dev/null +++ b/include/utils.h @@ -0,0 +1,12 @@ +#include +#include + +template +void loop_impl(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} + +template +void loop(F&& f) { + loop_impl(std::make_index_sequence{}, std::forward(f)); +} \ No newline at end of file diff --git a/micro/DRAM.cpp b/micro/DRAM.cpp index 59c5b7d5..26c5252b 100644 --- a/micro/DRAM.cpp +++ b/micro/DRAM.cpp @@ -1,6 +1,6 @@ #include "common.h" -namespace s = cl::sycl; +namespace s = sycl; template class MicroBenchDRAMKernel; @@ -52,7 +52,7 @@ class MicroBenchDRAM { } void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); // We spawn one work item for each buffer element to be copied. @@ -62,7 +62,7 @@ class MicroBenchDRAM { } bool verify(VerificationSetting& ver) { - auto result = output_buf.template get_access(); + auto result = output_buf.get_host_access(); for(size_t i = 0; i < buffer_size[0]; ++i) { for(size_t j = 0; j < (Dims < 2 ? 1 : buffer_size[1]); ++j) { for(size_t k = 0; k < (Dims < 3 ? 1 : buffer_size[2]); ++k) { @@ -102,11 +102,10 @@ int main(int argc, char** argv) { app.run>(); app.run>(); app.run>(); - if(app.deviceSupportsFP64()) { + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); app.run>(); app.run>(); } - return 0; } diff --git a/micro/arith.cpp b/micro/arith.cpp index b6962447..1029c74c 100644 --- a/micro/arith.cpp +++ b/micro/arith.cpp @@ -1,6 +1,6 @@ #include "common.h" -namespace s = cl::sycl; +namespace s = sycl; template class MicroBenchArithmeticKernel; @@ -46,13 +46,13 @@ class MicroBenchArithmetic { return {}; } - void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); cgh.parallel_for>( - s::range<1>{args.problem_size}, [=](cl::sycl::id<1> gid) { + s::range<1>{args.problem_size}, [=](sycl::id<1> gid) { DataT a1 = in[gid]; const DataT a2 = a1; @@ -68,7 +68,7 @@ class MicroBenchArithmetic { } bool verify(VerificationSetting& ver) { - auto result = output_buf.template get_access(); + auto result = output_buf.get_host_access(); for(size_t i = 0; i < args.problem_size; ++i) { if(result[i] != DataT{1}) { return false; @@ -91,8 +91,8 @@ int main(int argc, char** argv) { app.run>(); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } diff --git a/micro/host_device_bandwidth.cpp b/micro/host_device_bandwidth.cpp index 24b03db3..bc0c32c8 100644 --- a/micro/host_device_bandwidth.cpp +++ b/micro/host_device_bandwidth.cpp @@ -1,6 +1,6 @@ #include "common.h" -namespace s = cl::sycl; +namespace s = sycl; // The data type to be copied. This was originally a single byte (char), however // this causes device-side initialization kernels to quickly reach the @@ -117,8 +117,10 @@ class MicroBenchHostDeviceBandwidth { // Initialize buffer on device args.device_queue.submit([&](s::handler& cgh) { auto acc = buffer->template get_access(cgh); - cgh.parallel_for>( - copy_size, getStridedCopyOffset(), [=](s::id gid) { acc[gid] = TEST_VALUE; }); + cgh.parallel_for>(copy_size, [=](s::id gid) { + auto offset = getStridedCopyOffset(); + acc[gid + offset] = TEST_VALUE; + }); }); } } @@ -142,7 +144,7 @@ class MicroBenchHostDeviceBandwidth { if constexpr(Direction == CopyDirection::DEVICE_TO_HOST) { // Request host accessor for data that has been written on device - buffer->template get_access(); + buffer->get_host_access(); } } @@ -200,13 +202,13 @@ class MicroBenchHostDeviceBandwidth { cgh.single_task>([=]() { /* NOP */ }); }); - auto acc = buffer->template get_access(); + auto acc = buffer->get_host_access(); return verifyAccessor(acc); } if constexpr(Direction == CopyDirection::DEVICE_TO_HOST) { if constexpr(!Strided) { - auto acc = buffer->template get_access(); + auto acc = buffer->get_host_access(); return verifyAccessor(acc); } @@ -254,4 +256,4 @@ int main(int argc, char** argv) { app.run>(); return 0; -} +} \ No newline at end of file diff --git a/micro/local_mem.cpp b/micro/local_mem.cpp index 72c6fdbe..8eea7dac 100644 --- a/micro/local_mem.cpp +++ b/micro/local_mem.cpp @@ -2,7 +2,7 @@ #include -namespace s = cl::sycl; +namespace s = sycl; template class MicroBenchLocalMemoryKernel; @@ -30,12 +30,12 @@ class MicroBenchLocalMemory { output_buf.initialize(args.device_queue, s::range<1>(args.problem_size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); // local memory definition - s::accessor local_mem(args.local_size, cgh); + s::local_accessor local_mem(args.local_size, cgh); s::nd_range<1> ndrange{{args.problem_size}, {args.local_size}}; @@ -43,20 +43,21 @@ class MicroBenchLocalMemory { DATA_TYPE r0; int gid = item.get_global_id(0); int lid = item.get_local_id(0); - int lid2 = (item.get_local_id(0)+1) % item.get_local_range()[0]; + int lid2 = (item.get_local_id(0) + 1) % item.get_local_range()[0]; local_mem[lid] = in[gid]; - item.barrier(s::access::fence_space::local_space); + s::group_barrier(item.get_group()); - // Note: this is dangerous, as a compiler could in principle be smart enough to figure out that it can just drop this - // so far, we haven't encountered such a compiler, and all options to make it "safer" + // Note: this is dangerous, as a compiler could in principle be smart enough to figure out that it can just drop + // this + // so far, we haven't encountered such a compiler, and all options to make it "safer" // introduce overhead on at least some platform / data type combinations for(int i = 0; i < COMP_ITERS; i++) { local_mem[lid2] = local_mem[lid]; } - item.barrier(s::access::fence_space::local_space); + s::group_barrier(item.get_group()); out[gid] = local_mem[lid]; }); @@ -72,7 +73,7 @@ class MicroBenchLocalMemory { } bool verify(VerificationSetting& ver) { - auto result = output_buf.template get_access(); + auto result = output_buf.get_host_access(); for(size_t i = 0; i < args.problem_size; ++i) { if(result[i] != 42) { return false; @@ -99,8 +100,8 @@ int main(int argc, char** argv) { app.run>(); // double precision - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } diff --git a/micro/pattern_L2.cpp b/micro/pattern_L2.cpp index edd478c6..b6d03229 100644 --- a/micro/pattern_L2.cpp +++ b/micro/pattern_L2.cpp @@ -2,46 +2,43 @@ #include -namespace s = cl::sycl; +namespace s = sycl; -template class MicroBenchL2Kernel; +template +class MicroBenchL2Kernel; /* Microbenchmark stressing the main arithmetic units. */ template -class MicroBenchL2 -{ +class MicroBenchL2 { protected: - std::vector input; - BenchmarkArgs args; + std::vector input; + BenchmarkArgs args; + + PrefetchedBuffer input_buf; + PrefetchedBuffer output_buf; - PrefetchedBuffer input_buf; - PrefetchedBuffer output_buf; public: - MicroBenchL2(const BenchmarkArgs &_args) : args(_args) {} + MicroBenchL2(const BenchmarkArgs& _args) : args(_args) {} void setup() { - // buffers initialized to a default value - input. resize(args.problem_size, 10); + // buffers initialized to a default value + input.resize(args.problem_size, 10); input_buf.initialize(args.device_queue, input.data(), s::range<1>(args.problem_size)); output_buf.initialize(args.device_queue, s::range<1>(args.problem_size)); } - void run(std::vector& events){ - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { - auto in = input_buf.template get_access(cgh); + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { + auto in = input_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); - cl::sycl::range<1> ndrange {args.problem_size}; + sycl::range<1> ndrange{args.problem_size}; - cgh.parallel_for>(ndrange, - [=](cl::sycl::id<1> gid) - { + cgh.parallel_for>(ndrange, [=](sycl::id<1> gid) { DATA_TYPE r0; - for (int i=0;i >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - - // single precision - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); - app.run< MicroBenchL2 >(); + app.run>(); + app.run>(); + app.run>(); + app.run>(); + app.run>(); + + // single precision + app.run>(); + app.run>(); + app.run>(); + app.run>(); + app.run>(); + // double precision - if(app.deviceSupportsFP64()) { + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); app.run>(); app.run>(); app.run>(); app.run>(); } - return 0; } - - - diff --git a/micro/sf.cpp b/micro/sf.cpp index 47b9202d..90c5e9a8 100644 --- a/micro/sf.cpp +++ b/micro/sf.cpp @@ -1,6 +1,6 @@ #include "common.h" -namespace s = cl::sycl; +namespace s = sycl; template class MicroBenchSpecialFuncKernel; @@ -32,7 +32,7 @@ class MicroBenchSpecialFunc { return {OP / 1024.0 / 1024.0 / 1024.0, "GOP"}; } - void run(std::vector& events) { + void run(std::vector& events) { events.push_back(args.device_queue.submit([&](s::handler& cgh) { auto in = input_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); @@ -62,7 +62,7 @@ class MicroBenchSpecialFunc { v2 = s::tan(v0); } const DataT expected = v2; - auto result = output_buf.template get_access(); + auto result = output_buf.get_host_access(); for(size_t i = 0; i < args.problem_size; ++i) { constexpr DataT EPSILON = 1e-5; if(std::abs(result[i] - expected) > EPSILON) { @@ -85,8 +85,8 @@ int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } diff --git a/pattern/prefixsum.cpp b/pattern/prefixsum.cpp index 34adabac..d508fd7f 100644 --- a/pattern/prefixsum.cpp +++ b/pattern/prefixsum.cpp @@ -1 +1 @@ -TODO: SYCL code with a prefix sum +TODO : SYCL code with a prefix sum diff --git a/pattern/reduction.cpp b/pattern/reduction.cpp index bb5892fb..e1d938f4 100644 --- a/pattern/reduction.cpp +++ b/pattern/reduction.cpp @@ -2,81 +2,74 @@ #include "common.h" #include +#include #include #include #include -#include -using namespace cl; +using namespace sycl; -template class ReductionKernelNDRange; -template class ReductionKernelHierarchical; +template +class ReductionKernelNDRange; +template +class ReductionKernelHierarchical; template -class Reduction -{ +class Reduction { protected: - std::vector _input; - BenchmarkArgs _args; + std::vector _input; + BenchmarkArgs _args; + + PrefetchedBuffer _input_buff; + PrefetchedBuffer _output_buff; + sycl::buffer* _final_output_buff; + T _result; - PrefetchedBuffer _input_buff; - PrefetchedBuffer _output_buff; - sycl::buffer* _final_output_buff; - T _result; public: - Reduction(const BenchmarkArgs &args) - : _args{args} - { - assert(_args.problem_size % _args.local_size == 0); - } + Reduction(const BenchmarkArgs& args) : _args{args} { assert(_args.problem_size % _args.local_size == 0); } - void generate_input(std::vector& out) - { + void generate_input(std::vector& out) { out.resize(_args.problem_size); - for(std::size_t i = 0; i < out.size(); ++i) - out[i] = static_cast(i); + for(std::size_t i = 0; i < out.size(); ++i) out[i] = static_cast(i); } void setup() { generate_input(_input); - _input_buff.initialize(_args.device_queue, static_cast(_input.data()), sycl::range<1>(_args.problem_size)); + _input_buff.initialize( + _args.device_queue, static_cast(_input.data()), sycl::range<1>(_args.problem_size)); _output_buff.initialize(_args.device_queue, sycl::range<1>{_args.problem_size}); } - void submit_ndrange(std::vector& events){ - this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, - const size_t reduction_size, const size_t num_groups) { + void submit_ndrange(std::vector& events) { + this->submit([this, &events](sycl::buffer* input, sycl::buffer* output, const size_t reduction_size, + const size_t num_groups) { events.push_back(this->local_reduce_ndrange(input, output, reduction_size, num_groups)); }); } - void submit_hierarchical(std::vector& events){ - this->submit([this, &events](sycl::buffer *input, sycl::buffer *output, - const size_t reduction_size, const size_t num_groups) { - events.push_back(this->local_reduce_hierarchical(input, output, reduction_size, - num_groups)); + void submit_hierarchical(std::vector& events) { + this->submit([this, &events](sycl::buffer* input, sycl::buffer* output, const size_t reduction_size, + const size_t num_groups) { + events.push_back(this->local_reduce_hierarchical(input, output, reduction_size, num_groups)); }); } - bool verify(VerificationSetting &ver) { + bool verify(VerificationSetting& ver) { T result = _final_output_buff->get_host_access()[0]; // Calculate CPU result in fp64 to avoid obtaining a wrong verification result std::vector input_fp64(_input.size()); - for(std::size_t i = 0; i < _input.size(); ++i) - input_fp64[i] = static_cast(_input[i]); - + for(std::size_t i = 0; i < _input.size(); ++i) input_fp64[i] = static_cast(i); double delta = - static_cast(result) - std::accumulate(input_fp64.begin(), input_fp64.end(), T{}); - + static_cast(result) - std::reduce(input_fp64.begin(), input_fp64.end(), 0, std::plus()); return std::abs(delta) < 1.e-5; } + private: - template - void submit(Kernel_invocation_function kernel) - { + template + void submit(Kernel_invocation_function kernel) { sycl::buffer* input_buff = &_input_buff.get(); sycl::buffer* output_buff = &_output_buff.get(); @@ -85,13 +78,11 @@ class Reduction do { // invoke local reduction - kernel(input_buff, output_buff, current_reduction_size, - current_num_groups); + kernel(input_buff, output_buff, current_reduction_size, current_num_groups); current_reduction_size = current_num_groups; if(current_num_groups > 1) - current_num_groups = - (current_reduction_size + _args.local_size - 1) / _args.local_size; + current_num_groups = (current_reduction_size + _args.local_size - 1) / _args.local_size; else // This was the final iteration current_num_groups = 0; @@ -101,113 +92,87 @@ class Reduction std::swap(input_buff, output_buff); } while(current_num_groups > 0); - + _final_output_buff = output_buff; } - sycl::event local_reduce_ndrange( - sycl::buffer* input, sycl::buffer* output, - const size_t reduction_size, const std::size_t num_groups) - { - return _args.device_queue.submit([&](sycl::handler &cgh) { - - sycl::nd_range<1> ndrange{num_groups * _args.local_size, - _args.local_size}; + sycl::event local_reduce_ndrange(sycl::buffer* input, sycl::buffer* output, const size_t reduction_size, + const std::size_t num_groups) { + return _args.device_queue.submit([&](sycl::handler& cgh) { + sycl::nd_range<1> ndrange{num_groups * _args.local_size, _args.local_size}; - using namespace cl::sycl::access; + using namespace sycl::access; - auto acc = input->template get_access(cgh); + auto acc = input->template get_access(cgh); auto acc_out = output->template get_access(cgh); - auto scratch = sycl::accessor - {_args.local_size, cgh}; - + auto scratch = sycl::local_accessor{_args.local_size, cgh}; const int group_size = _args.local_size; - cgh.parallel_for>( - ndrange, - [=](sycl::nd_item<1> item) { - - const int lid = item.get_local_id(0); - const auto gid = item.get_global_id(); - - scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0; - - for(int i = group_size/2; i > 0; i /= 2) { - - item.barrier(); - if(lid < i) - scratch[lid] += scratch[lid + i]; - - } - if(lid == 0) - acc_out[item.get_group(0)] = scratch[0]; - }); + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + const int lid = item.get_local_id(0); + const auto gid = item.get_global_id(); + + scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0; + + for(int i = group_size / 2; i > 0; i /= 2) { + sycl::group_barrier(item.get_group()); + if(lid < i) + scratch[lid] += scratch[lid + i]; + } + if(lid == 0) + acc_out[item.get_group(0)] = scratch[0]; + }); }); // submit } - sycl::event local_reduce_hierarchical( - sycl::buffer* input, sycl::buffer* output, - const size_t reduction_size, const std::size_t num_groups) - { - return _args.device_queue.submit( - [&](sycl::handler& cgh) { - + sycl::event local_reduce_hierarchical(sycl::buffer* input, sycl::buffer* output, + const size_t reduction_size, const std::size_t num_groups) { + return _args.device_queue.submit([&](sycl::handler& cgh) { using namespace sycl::access; - auto acc = input->template get_access(cgh); + auto acc = input->template get_access(cgh); auto acc_out = output->template get_access(cgh); - auto scratch = sycl::accessor - {_args.local_size, cgh}; + auto scratch = sycl::local_accessor{_args.local_size, cgh}; const int group_size = _args.local_size; cgh.parallel_for_work_group>( - sycl::range<1>{num_groups}, - sycl::range<1>{_args.local_size}, - [=](sycl::group<1> grp) { - - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - const int lid = idx.get_local_id(0); - const auto gid = idx.get_global_id(); - - scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0; - }); - - for(int i = group_size/2; i > 0; i /= 2) { - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ + sycl::range<1>{num_groups}, sycl::range<1>{_args.local_size}, [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { const int lid = idx.get_local_id(0); + const auto gid = idx.get_global_id(); - if (lid < i) - scratch[lid] += scratch[lid + i]; + scratch[lid] = (gid[0] < reduction_size) ? acc[gid] : 0; }); - } - - // Spawn another parallel_for_work_item to work around - // limitations in hipSYCL device implementation of - // hierarchical parallel for - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - if(idx.get_local_id(0) == 0) - acc_out[grp.get_id(0)] = scratch[0]; - }); - }); + for(int i = group_size / 2; i > 0; i /= 2) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + const int lid = idx.get_local_id(0); + + if(lid < i) + scratch[lid] += scratch[lid + i]; + }); + } + + // Spawn another parallel_for_work_item to work around + // limitations in hipSYCL device implementation of + // hierarchical parallel for + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + if(idx.get_local_id(0) == 0) + acc_out[grp.get_group_id(0)] = scratch[0]; + }); + }); }); // submit } - }; -template -class ReductionNDRange : public Reduction -{ +template +class ReductionNDRange : public Reduction { public: - ReductionNDRange(const BenchmarkArgs &args) - : Reduction{args} - {} + ReductionNDRange(const BenchmarkArgs& args) : Reduction{args} {} - void run(std::vector& events){ - this->submit_ndrange(events); - } + void run(std::vector& events) { this->submit_ndrange(events); } static std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; @@ -217,15 +182,12 @@ class ReductionNDRange : public Reduction } }; -template -class ReductionHierarchical : public Reduction -{ +template +class ReductionHierarchical : public Reduction { public: - ReductionHierarchical(const BenchmarkArgs &args) - : Reduction{args} - {} + ReductionHierarchical(const BenchmarkArgs& args) : Reduction{args} {} - void run(std::vector& events){ + void run(std::vector& events) { this->submit_hierarchical(events); // Waiting is not necessary as the BenchmarkManager will already call // wait_and_throw() here @@ -239,29 +201,26 @@ class ReductionHierarchical : public Reduction } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); // Using short will lead to overflow even for // small problem sizes - //app.run< ReductionNDRange>(); - if(app.shouldRunNDRangeKernels()){ - app.run< ReductionNDRange>(); - app.run< ReductionNDRange>(); - app.run< ReductionNDRange>(); - if(app.deviceSupportsFP64()) + // app.run< ReductionNDRange>(); + if(app.shouldRunNDRangeKernels()) { + app.run>(); + app.run>(); + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } } - //app.run< ReductionHierarchical>(); - app.run< ReductionHierarchical>(); - app.run< ReductionHierarchical>(); - app.run< ReductionHierarchical>(); - if(app.deviceSupportsFP64()) + // app.run< ReductionHierarchical>(); + app.run>(); + app.run>(); + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } - - - diff --git a/pattern/scan.cpp b/pattern/scan.cpp index 1aaacc15..9258b297 100644 --- a/pattern/scan.cpp +++ b/pattern/scan.cpp @@ -1 +1 @@ -TODO: SYCL code with a scan +TODO : SYCL code with a scan diff --git a/pattern/segmentedreduction.cpp b/pattern/segmentedreduction.cpp index ea22f1f1..fe35c0f9 100644 --- a/pattern/segmentedreduction.cpp +++ b/pattern/segmentedreduction.cpp @@ -1,130 +1,108 @@ #include "common.h" +#include #include #include -#include -using namespace cl; +using namespace sycl; -template class ReductionKernelNDRange; -template class ReductionKernelHierarchical; +template +class ReductionKernelNDRange; +template +class ReductionKernelHierarchical; template -class SegmentedReduction -{ +class SegmentedReduction { protected: - std::vector _input; - BenchmarkArgs _args; - PrefetchedBuffer _buff; + std::vector _input; + BenchmarkArgs _args; + PrefetchedBuffer _buff; + public: - SegmentedReduction(const BenchmarkArgs &args) - : _args{args} - { - - assert(_args.problem_size % _args.local_size == 0); - } + SegmentedReduction(const BenchmarkArgs& args) : _args{args} { assert(_args.problem_size % _args.local_size == 0); } - void generate_input(std::vector& out) - { + void generate_input(std::vector& out) { out.resize(_args.problem_size); - for(std::size_t i = 0; i < out.size(); ++i) - out[i] = static_cast(i); + for(std::size_t i = 0; i < out.size(); ++i) out[i] = static_cast(i); } void setup() { generate_input(_input); - _buff.initialize(_args.device_queue,_input.data(), sycl::range<1>(_args.problem_size)); + _buff.initialize(_args.device_queue, _input.data(), sycl::range<1>(_args.problem_size)); } - void submit_ndrange(std::vector& events){ - - events.push_back(_args.device_queue.submit( - [&](sycl::handler& cgh) { - - sycl::nd_range<1> ndrange {_args.problem_size, _args.local_size}; + void submit_ndrange(std::vector& events) { + events.push_back(_args.device_queue.submit([&](sycl::handler& cgh) { + sycl::nd_range<1> ndrange{_args.problem_size, _args.local_size}; - using namespace cl::sycl::access; + using namespace sycl::access; auto acc = _buff.template get_access(cgh); - auto scratch = sycl::accessor - {_args.local_size, cgh}; + auto scratch = sycl::local_accessor{_args.local_size, cgh}; const int group_size = _args.local_size; - cgh.parallel_for>( - ndrange, - [=](sycl::nd_item<1> item) { - - const int lid = item.get_local_id(0); - const auto gid = item.get_global_id(); - - scratch[lid] = acc[gid]; - - for(int i = group_size/2; i > 0; i /= 2) { + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + const int lid = item.get_local_id(0); + const auto gid = item.get_global_id(); - item.barrier(); - if(lid < i) - scratch[lid] += scratch[lid + i]; + scratch[lid] = acc[gid]; - } - if(lid == 0) - acc[gid] = scratch[0]; - }); + for(int i = group_size / 2; i > 0; i /= 2) { + sycl::group_barrier(item.get_group()); + if(lid < i) + scratch[lid] += scratch[lid + i]; + } + if(lid == 0) + acc[gid] = scratch[0]; + }); })); // submit } - void submit_hierarchical(std::vector& events){ - - events.push_back(_args.device_queue.submit( - [&](sycl::handler& cgh) { - + void submit_hierarchical(std::vector& events) { + events.push_back(_args.device_queue.submit([&](sycl::handler& cgh) { using namespace sycl::access; auto acc = _buff.template get_access(cgh); - auto scratch = sycl::accessor - {_args.local_size, cgh}; + auto scratch = sycl::local_accessor{_args.local_size, cgh}; const int group_size = _args.local_size; - cgh.parallel_for_work_group>( - sycl::range<1>{_args.problem_size / _args.local_size}, - sycl::range<1>{_args.local_size}, - [=](sycl::group<1> grp) { - - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - const int lid = idx.get_local_id(0); - const auto gid = idx.get_global_id(); - - scratch[lid] = acc[gid]; - }); - - for(int i = group_size/2; i > 0; i /= 2) { - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ + cgh.parallel_for_work_group>(sycl::range<1>{_args.problem_size / _args.local_size}, + sycl::range<1>{_args.local_size}, [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { const int lid = idx.get_local_id(0); + const auto gid = idx.get_global_id(); - if (lid < i) - scratch[lid] += scratch[lid + i]; + scratch[lid] = acc[gid]; }); - } - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - if(idx.get_local_id(0) == 0) - acc[idx.get_global_id()] = scratch[0]; + for(int i = group_size / 2; i > 0; i /= 2) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + const int lid = idx.get_local_id(0); + + if(lid < i) + scratch[lid] += scratch[lid + i]; + }); + } + + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + if(idx.get_local_id(0) == 0) + acc[idx.get_global_id()] = scratch[0]; + }); }); - }); })); // submit } - bool verify(VerificationSetting &ver) { + bool verify(VerificationSetting& ver) { std::vector original_input; generate_input(original_input); - auto acc = _buff.template get_access(); + auto acc = _buff.get_host_access(); size_t num_groups = _args.problem_size / _args.local_size; for(size_t group = 0; group < num_groups; ++group) { - size_t group_offset = group * _args.local_size; T sum = 0; @@ -144,19 +122,14 @@ class SegmentedReduction return true; } - - }; -template -class SegmentedReductionNDRange : public SegmentedReduction -{ +template +class SegmentedReductionNDRange : public SegmentedReduction { public: - SegmentedReductionNDRange(const BenchmarkArgs &args) - : SegmentedReduction{args} - {} + SegmentedReductionNDRange(const BenchmarkArgs& args) : SegmentedReduction{args} {} - void run(std::vector& events){ + void run(std::vector& events) { this->submit_ndrange(events); // Waiting is not necessary as the BenchmarkManager will already call // wait_and_throw() here @@ -170,15 +143,12 @@ class SegmentedReductionNDRange : public SegmentedReduction } }; -template -class SegmentedReductionHierarchical : public SegmentedReduction -{ +template +class SegmentedReductionHierarchical : public SegmentedReduction { public: - SegmentedReductionHierarchical(const BenchmarkArgs &args) - : SegmentedReduction{args} - {} + SegmentedReductionHierarchical(const BenchmarkArgs& args) : SegmentedReduction{args} {} - void run(std::vector& events){ + void run(std::vector& events) { this->submit_hierarchical(events); // Waiting is not necessary as the BenchmarkManager will already call // wait_and_throw() here @@ -192,28 +162,25 @@ class SegmentedReductionHierarchical : public SegmentedReduction } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); if(app.shouldRunNDRangeKernels()) { - app.run< SegmentedReductionNDRange>(); - app.run< SegmentedReductionNDRange>(); - app.run< SegmentedReductionNDRange>(); - app.run< SegmentedReductionNDRange>(); - if(app.deviceSupportsFP64()) + app.run>(); + app.run>(); + app.run>(); + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } } - app.run< SegmentedReductionHierarchical>(); - app.run< SegmentedReductionHierarchical>(); - app.run< SegmentedReductionHierarchical>(); - app.run< SegmentedReductionHierarchical>(); - if(app.deviceSupportsFP64()) + app.run>(); + app.run>(); + app.run>(); + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } - - - diff --git a/pattern/segmentedscan.cpp b/pattern/segmentedscan.cpp index 64de7319..dc892c82 100644 --- a/pattern/segmentedscan.cpp +++ b/pattern/segmentedscan.cpp @@ -1 +1 @@ -TODO: SYCL code with segmented scan +TODO : SYCL code with segmented scan diff --git a/polybench/2DConvolution.cpp b/polybench/2DConvolution.cpp index 65c2d825..c35410cc 100644 --- a/polybench/2DConvolution.cpp +++ b/polybench/2DConvolution.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -13,106 +13,108 @@ using DATA_TYPE = float; class conv2D; void init(DATA_TYPE* A, size_t size) { - const auto NI = size; - const auto NJ = size; - - for(size_t i = 0; i < NI; ++i) { - for(size_t j = 0; j < NJ; ++j) { - A[i * NJ + j] = (float)rand() / (float)RAND_MAX; - } - } + const auto NI = size; + const auto NJ = size; + + for(size_t i = 0; i < NI; ++i) { + for(size_t j = 0; j < NJ; ++j) { + A[i * NJ + j] = (float)rand() / (float)RAND_MAX; + } + } } void conv2D(DATA_TYPE* A, DATA_TYPE* B, size_t size) { - const auto NI = size; - const auto NJ = size; - - const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8; - const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9; - const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10; - - for(size_t i = 1; i < NI - 1; ++i) { - for(size_t j = 1; j < NJ - 1; ++j) { - B[i * NJ + j] = c11 * A[(i - 1) * NJ + (j - 1)] + c12 * A[(i + 0) * NJ + (j - 1)] + c13 * A[(i + 1) * NJ + (j - 1)] - + c21 * A[(i - 1) * NJ + (j + 0)] + c22 * A[(i + 0) * NJ + (j + 0)] + c23 * A[(i + 1) * NJ + (j + 0)] - + c31 * A[(i - 1) * NJ + (j + 1)] + c32 * A[(i + 0) * NJ + (j + 1)] + c33 * A[(i + 1) * NJ + (j + 1)]; - } - } + const auto NI = size; + const auto NJ = size; + + const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8; + const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9; + const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10; + + for(size_t i = 1; i < NI - 1; ++i) { + for(size_t j = 1; j < NJ - 1; ++j) { + B[i * NJ + j] = + c11 * A[(i - 1) * NJ + (j - 1)] + c12 * A[(i + 0) * NJ + (j - 1)] + c13 * A[(i + 1) * NJ + (j - 1)] + + c21 * A[(i - 1) * NJ + (j + 0)] + c22 * A[(i + 0) * NJ + (j + 0)] + c23 * A[(i + 1) * NJ + (j + 0)] + + c31 * A[(i - 1) * NJ + (j + 1)] + c32 * A[(i + 0) * NJ + (j + 1)] + c33 * A[(i + 1) * NJ + (j + 1)]; + } + } } class Polybench_2DConvolution { - public: - Polybench_2DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} +public: + Polybench_2DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - B.resize(size * size); + void setup() { + A.resize(size * size); + B.resize(size * size); - init(A.data(), size); + init(A.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); - cgh.parallel_for(B_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; + cgh.parallel_for(B_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; - const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8; - const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9; - const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10; + const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8; + const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9; + const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10; - if((i > 0) && (j > 0) && (i < size_ - 1) && (j < size_ - 1)) { - B[item] = c11 * A[{(i - 1), (j - 1)}] + c12 * A[{(i + 0), (j - 1)}] + c13 * A[{(i + 1), (j - 1)}] + c21 * A[{(i - 1), (j + 0)}] - + c22 * A[{(i + 0), (j + 0)}] + c23 * A[{(i + 1), (j + 0)}] + c31 * A[{(i - 1), (j + 1)}] + c32 * A[{(i + 0), (j + 1)}] - + c33 * A[{(i + 1), (j + 1)}]; - } - }); - })); - } + if((i > 0) && (j > 0) && (i < size_ - 1) && (j < size_ - 1)) { + B[item] = c11 * A[{(i - 1), (j - 1)}] + c12 * A[{(i + 0), (j - 1)}] + c13 * A[{(i + 1), (j - 1)}] + + c21 * A[{(i - 1), (j + 0)}] + c22 * A[{(i + 0), (j + 0)}] + c23 * A[{(i + 1), (j + 0)}] + + c31 * A[{(i - 1), (j + 1)}] + c32 * A[{(i + 0), (j + 1)}] + c33 * A[{(i + 1), (j + 1)}]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - auto B_acc = B_buffer.get_access(); + auto B_acc = B_buffer.get_host_access(); - std::vector B_cpu(size * size); - conv2D(A.data(), B_cpu.data(), size); + std::vector B_cpu(size * size); + conv2D(A.data(), B_cpu.data(), size); - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - if((i > 0) && (j > 0) && (i < size - 1) && (j < size - 1)) { - const auto diff = percentDiff(B_cpu[i * size + j], B_acc.get_pointer()[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } - } + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + if((i > 0) && (j > 0) && (i < size - 1) && (j < size - 1)) { + const auto diff = percentDiff(B_cpu[i * size + j], B_acc.get_pointer()[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2DConvolution"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2DConvolution"; } - private: - BenchmarkArgs args; +private: + BenchmarkArgs args; - const size_t size; - std::vector A; - std::vector B; + const size_t size; + std::vector A; + std::vector B; - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/2mm.cpp b/polybench/2mm.cpp index 13286190..0f28d864 100644 --- a/polybench/2mm.cpp +++ b/polybench/2mm.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -14,156 +14,157 @@ class Polybench_2mm_2; class Polybench_2mm_1; void init_array(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - const auto NL = size; - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NK; j++) { - A[i * NI + j] = ((DATA_TYPE)i * j) / NI; - } - } - - for(size_t i = 0; i < NK; i++) { - for(size_t j = 0; j < NJ; j++) { - B[i * NK + j] = ((DATA_TYPE)i * (j + 1)) / NJ; - } - } - - for(size_t i = 0; i < NL; i++) { - for(size_t j = 0; j < NJ; j++) { - C[i * NL + j] = ((DATA_TYPE)i * (j + 3)) / NL; - } - } - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NL; j++) { - D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK; - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + const auto NL = size; + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NK; j++) { + A[i * NI + j] = ((DATA_TYPE)i * j) / NI; + } + } + + for(size_t i = 0; i < NK; i++) { + for(size_t j = 0; j < NJ; j++) { + B[i * NK + j] = ((DATA_TYPE)i * (j + 1)) / NJ; + } + } + + for(size_t i = 0; i < NL; i++) { + for(size_t j = 0; j < NJ; j++) { + C[i * NL + j] = ((DATA_TYPE)i * (j + 3)) / NL; + } + } + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NL; j++) { + D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK; + } + } } void mm2_cpu(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - const auto NL = size; - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NJ; j++) { - for(size_t k = 0; k < NK; ++k) { - C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j]; - } - } - } - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NL; j++) { - E[i * NL + j] = 0; - for(size_t k = 0; k < NJ; ++k) { - E[i * NL + j] += C[i * NJ + k] * D[k * NL + j]; - } - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + const auto NL = size; + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NJ; j++) { + for(size_t k = 0; k < NK; ++k) { + C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j]; + } + } + } + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NL; j++) { + E[i * NL + j] = 0; + for(size_t k = 0; k < NJ; ++k) { + E[i * NL + j] += C[i * NJ + k] * D[k * NL + j]; + } + } + } } class Polybench_2mm { - public: - Polybench_2mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - A.resize(size * size); - B.resize(size * size); - C.resize(size * size); - D.resize(size * size); - E.resize(size * size); - - init_array(A.data(), B.data(), C.data(), D.data(), size); - - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size)); - D_buffer.initialize(args.device_queue, D.data(), cl::sycl::range<2>(size, size)); - E_buffer.initialize(args.device_queue, E.data(), cl::sycl::range<2>(size, size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - auto C = C_buffer.get_access(cgh); - - cgh.parallel_for(C_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - for(size_t k = 0; k < size_; k++) { - C[item] += A[{i, k}] * B[{k, j}]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto C = C_buffer.get_access(cgh); - auto D = D_buffer.get_access(cgh); - auto E = E_buffer.get_access(cgh); - - cgh.parallel_for(E_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - E[item] = 0; - for(size_t k = 0; k < size_; k++) { - E[item] += C[{i, k}] * D[{k, j}]; - } - }); - })); - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - init_array(A.data(), B.data(), C.data(), D.data(), size); - - std::vector E_cpu(size * size); - mm2_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), size); - - auto E_acc = E_buffer.get_access(); - - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(E_cpu[i * size + j], E_acc.get_pointer()[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2mm"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector B; - std::vector C; - std::vector D; - std::vector E; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; - PrefetchedBuffer C_buffer; - PrefetchedBuffer D_buffer; - PrefetchedBuffer E_buffer; +public: + Polybench_2mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + A.resize(size * size); + B.resize(size * size); + C.resize(size * size); + D.resize(size * size); + E.resize(size * size); + + init_array(A.data(), B.data(), C.data(), D.data(), size); + + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size)); + D_buffer.initialize(args.device_queue, D.data(), sycl::range<2>(size, size)); + E_buffer.initialize(args.device_queue, E.data(), sycl::range<2>(size, size)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + auto C = C_buffer.get_access(cgh); + + cgh.parallel_for(C_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + for(size_t k = 0; k < size_; k++) { + C[item] += A[{i, k}] * B[{k, j}]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto C = C_buffer.get_access(cgh); + auto D = D_buffer.get_access(cgh); + auto E = E_buffer.get_access(cgh); + + cgh.parallel_for(E_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + E[item] = 0; + for(size_t k = 0; k < size_; k++) { + E[item] += C[{i, k}] * D[{k, j}]; + } + }); + })); + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + init_array(A.data(), B.data(), C.data(), D.data(), size); + + std::vector E_cpu(size * size); + mm2_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), size); + + auto E_acc = E_buffer.get_host_access(); + + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(E_cpu[i * size + j], E_acc.get_pointer()[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_2mm"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector B; + std::vector C; + std::vector D; + std::vector E; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; + PrefetchedBuffer C_buffer; + PrefetchedBuffer D_buffer; + PrefetchedBuffer E_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/3DConvolution.cpp b/polybench/3DConvolution.cpp index 2b057ac7..d199b5a3 100644 --- a/polybench/3DConvolution.cpp +++ b/polybench/3DConvolution.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -13,125 +13,135 @@ using DATA_TYPE = float; class conv3D; void init(DATA_TYPE* A, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - - for(size_t i = 0; i < NI; ++i) { - for(size_t j = 0; j < NJ; ++j) { - for(size_t k = 0; k < NK; ++k) { - A[i * (NK * NJ) + j * NK + k] = i % 12 + 2 * (j % 7) + 3 * (k % 13); - } - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + + for(size_t i = 0; i < NI; ++i) { + for(size_t j = 0; j < NJ; ++j) { + for(size_t k = 0; k < NK; ++k) { + A[i * (NK * NJ) + j * NK + k] = i % 12 + 2 * (j % 7) + 3 * (k % 13); + } + } + } } void conv3D(DATA_TYPE* A, DATA_TYPE* B, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - - const DATA_TYPE c11 = +2, c21 = +5, c31 = -8; - const DATA_TYPE c12 = -3, c22 = +6, c32 = -9; - const DATA_TYPE c13 = +4, c23 = +7, c33 = +10; - - for(size_t i = 1; i < NI - 1; ++i) { - for(size_t j = 1; j < NJ - 1; ++j) { - for(size_t k = 1; k < NK - 1; ++k) { - B[i * (NK * NJ) + j * NK + k] = c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] - + c21 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c23 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] - + c31 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + c33 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] - + c12 * A[(i + 0) * (NK * NJ) + (j - 1) * NK + (k + 0)] + c22 * A[(i + 0) * (NK * NJ) + (j + 0) * NK + (k + 0)] - + c32 * A[(i + 0) * (NK * NJ) + (j + 1) * NK + (k + 0)] + c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] - + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] + c21 * A[(i - 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] - + c23 * A[(i + 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] + c31 * A[(i - 1) * (NK * NJ) + (j + 1) * NK + (k + 1)] - + c33 * A[(i + 1) * (NK * NJ) + (j + 1) * NK + (k + 1)]; - } - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + + const DATA_TYPE c11 = +2, c21 = +5, c31 = -8; + const DATA_TYPE c12 = -3, c22 = +6, c32 = -9; + const DATA_TYPE c13 = +4, c23 = +7, c33 = +10; + + for(size_t i = 1; i < NI - 1; ++i) { + for(size_t j = 1; j < NJ - 1; ++j) { + for(size_t k = 1; k < NK - 1; ++k) { + B[i * (NK * NJ) + j * NK + k] = c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c21 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c23 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c31 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c33 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k - 1)] + + c12 * A[(i + 0) * (NK * NJ) + (j - 1) * NK + (k + 0)] + + c22 * A[(i + 0) * (NK * NJ) + (j + 0) * NK + (k + 0)] + + c32 * A[(i + 0) * (NK * NJ) + (j + 1) * NK + (k + 0)] + + c11 * A[(i - 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] + + c13 * A[(i + 1) * (NK * NJ) + (j - 1) * NK + (k + 1)] + + c21 * A[(i - 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] + + c23 * A[(i + 1) * (NK * NJ) + (j + 0) * NK + (k + 1)] + + c31 * A[(i - 1) * (NK * NJ) + (j + 1) * NK + (k + 1)] + + c33 * A[(i + 1) * (NK * NJ) + (j + 1) * NK + (k + 1)]; + } + } + } } class Polybench_3DConvolution { - public: - Polybench_3DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - A.resize(size * size * size); - B.resize(size * size * size); - - init(A.data(), size); - - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<3>(size, size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<3>(size, size, size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - - cgh.parallel_for(B_buffer.get_range(), [=, size_ = size](item<3> item) { - const auto i = item[0]; - const auto j = item[1]; - const auto k = item[2]; - - const DATA_TYPE c11 = +2, c21 = +5, c31 = -8; - const DATA_TYPE c12 = -3, c22 = +6, c32 = -9; - const DATA_TYPE c13 = +4, c23 = +7, c33 = +10; - - if((i > 0) && (j > 0) && (k > 0) && (i < (size_ - 1)) && (j < (size_ - 1)) && (k < (size_ - 1))) { - B[item] = c11 * A[{(i - 1), (j - 1), (k - 1)}] + c13 * A[{(i + 1), (j - 1), (k - 1)}] + c21 * A[{(i - 1), (j - 1), (k - 1)}] - + c23 * A[{(i + 1), (j - 1), (k - 1)}] + c31 * A[{(i - 1), (j - 1), (k - 1)}] + c33 * A[{(i + 1), (j - 1), (k - 1)}] - + c12 * A[{(i + 0), (j - 1), (k + 0)}] + c22 * A[{(i + 0), (j + 0), (k + 0)}] + c32 * A[{(i + 0), (j + 1), (k + 0)}] - + c11 * A[{(i - 1), (j - 1), (k + 1)}] + c13 * A[{(i + 1), (j - 1), (k + 1)}] + c21 * A[{(i - 1), (j + 0), (k + 1)}] - + c23 * A[{(i + 1), (j + 0), (k + 1)}] + c31 * A[{(i - 1), (j + 1), (k + 1)}] + c33 * A[{(i + 1), (j + 1), (k + 1)}]; - } - }); - })); - } - - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - std::vector B_cpu(size * size * size); - conv3D(A.data(), B_cpu.data(), size); - - auto B_acc = B_buffer.get_access(); - - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - for(size_t k = 0; k < size; k++) { - if((i > 0) && (j > 0) && (k > 0) && (i < (size - 1)) && (j < (size - 1)) && (k < (size - 1))) { - const auto diff = percentDiff(B_cpu[i * (size * size) + j * size + k], - B_acc.get_pointer()[i * (size * size) + j * size + k]); - if(diff > ERROR_THRESHOLD) - return false; - } - } - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3DConvolution"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector B; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; +public: + Polybench_3DConvolution(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + A.resize(size * size * size); + B.resize(size * size * size); + + init(A.data(), size); + + A_buffer.initialize(args.device_queue, A.data(), sycl::range<3>(size, size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<3>(size, size, size)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + + cgh.parallel_for(B_buffer.get_range(), [=, size_ = size](item<3> item) { + const auto i = item[0]; + const auto j = item[1]; + const auto k = item[2]; + + const DATA_TYPE c11 = +2, c21 = +5, c31 = -8; + const DATA_TYPE c12 = -3, c22 = +6, c32 = -9; + const DATA_TYPE c13 = +4, c23 = +7, c33 = +10; + + if((i > 0) && (j > 0) && (k > 0) && (i < (size_ - 1)) && (j < (size_ - 1)) && (k < (size_ - 1))) { + B[item] = c11 * A[{(i - 1), (j - 1), (k - 1)}] + c13 * A[{(i + 1), (j - 1), (k - 1)}] + + c21 * A[{(i - 1), (j - 1), (k - 1)}] + c23 * A[{(i + 1), (j - 1), (k - 1)}] + + c31 * A[{(i - 1), (j - 1), (k - 1)}] + c33 * A[{(i + 1), (j - 1), (k - 1)}] + + c12 * A[{(i + 0), (j - 1), (k + 0)}] + c22 * A[{(i + 0), (j + 0), (k + 0)}] + + c32 * A[{(i + 0), (j + 1), (k + 0)}] + c11 * A[{(i - 1), (j - 1), (k + 1)}] + + c13 * A[{(i + 1), (j - 1), (k + 1)}] + c21 * A[{(i - 1), (j + 0), (k + 1)}] + + c23 * A[{(i + 1), (j + 0), (k + 1)}] + c31 * A[{(i - 1), (j + 1), (k + 1)}] + + c33 * A[{(i + 1), (j + 1), (k + 1)}]; + } + }); + })); + } + + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + std::vector B_cpu(size * size * size); + conv3D(A.data(), B_cpu.data(), size); + + auto B_acc = B_buffer.get_host_access(); + + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + for(size_t k = 0; k < size; k++) { + if((i > 0) && (j > 0) && (k > 0) && (i < (size - 1)) && (j < (size - 1)) && (k < (size - 1))) { + const auto diff = percentDiff( + B_cpu[i * (size * size) + j * size + k], B_acc.get_pointer()[i * (size * size) + j * size + k]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3DConvolution"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector B; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/3mm.cpp b/polybench/3mm.cpp index d3c42467..f451b11f 100644 --- a/polybench/3mm.cpp +++ b/polybench/3mm.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -15,196 +15,198 @@ class Polybench_3mm_2; class Polybench_3mm_3; void init_array(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - const auto NL = size; - const auto NM = size; - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NK; j++) { - A[i * NK + j] = ((DATA_TYPE)i * j) / NI; - } - } - - for(size_t i = 0; i < NK; i++) { - for(size_t j = 0; j < NJ; j++) { - B[i * NJ + j] = ((DATA_TYPE)i * (j + 1)) / NJ; - } - } - - for(size_t i = 0; i < NJ; i++) { - for(size_t j = 0; j < NM; j++) { - C[i * NM + j] = ((DATA_TYPE)i * (j + 3)) / NL; - } - } - - for(size_t i = 0; i < NM; i++) { - for(size_t j = 0; j < NL; j++) { - D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK; - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + const auto NL = size; + const auto NM = size; + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NK; j++) { + A[i * NK + j] = ((DATA_TYPE)i * j) / NI; + } + } + + for(size_t i = 0; i < NK; i++) { + for(size_t j = 0; j < NJ; j++) { + B[i * NJ + j] = ((DATA_TYPE)i * (j + 1)) / NJ; + } + } + + for(size_t i = 0; i < NJ; i++) { + for(size_t j = 0; j < NM; j++) { + C[i * NM + j] = ((DATA_TYPE)i * (j + 3)) / NL; + } + } + + for(size_t i = 0; i < NM; i++) { + for(size_t j = 0; j < NL; j++) { + D[i * NL + j] = ((DATA_TYPE)i * (j + 2)) / NK; + } + } } -void mm3_cpu(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, DATA_TYPE* F, DATA_TYPE* G, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - const auto NL = size; - const auto NM = size; - - /* E := A*B */ - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NJ; j++) { - E[i * NJ + j] = 0; - for(size_t k = 0; k < NK; ++k) { - E[i * NJ + j] += A[i * NK + k] * B[k * NJ + j]; - } - } - } - - /* F := C*D */ - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NL; j++) { - F[i * NL + j] = 0; - for(size_t k = 0; k < NM; ++k) { - F[i * NL + j] += C[i * NM + k] * D[k * NL + j]; - } - } - } - - /* G := E*F */ - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NL; j++) { - G[i * NL + j] = 0; - for(size_t k = 0; k < NJ; ++k) { - G[i * NL + j] += E[i * NJ + k] * F[k * NL + j]; - } - } - } +void mm3_cpu( + DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E, DATA_TYPE* F, DATA_TYPE* G, size_t size) { + const auto NI = size; + const auto NJ = size; + const auto NK = size; + const auto NL = size; + const auto NM = size; + + /* E := A*B */ + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NJ; j++) { + E[i * NJ + j] = 0; + for(size_t k = 0; k < NK; ++k) { + E[i * NJ + j] += A[i * NK + k] * B[k * NJ + j]; + } + } + } + + /* F := C*D */ + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NL; j++) { + F[i * NL + j] = 0; + for(size_t k = 0; k < NM; ++k) { + F[i * NL + j] += C[i * NM + k] * D[k * NL + j]; + } + } + } + + /* G := E*F */ + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NL; j++) { + G[i * NL + j] = 0; + for(size_t k = 0; k < NJ; ++k) { + G[i * NL + j] += E[i * NJ + k] * F[k * NL + j]; + } + } + } } class Polybench_3mm { - public: - Polybench_3mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - A.resize(size * size); - B.resize(size * size); - C.resize(size * size); - D.resize(size * size); - E.resize(size * size); - F.resize(size * size); - G.resize(size * size); - - init_array(A.data(), B.data(), C.data(), D.data(), size); - - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size)); - D_buffer.initialize(args.device_queue, D.data(), cl::sycl::range<2>(size, size)); - E_buffer.initialize(args.device_queue, E.data(), cl::sycl::range<2>(size, size)); - F_buffer.initialize(args.device_queue, F.data(), cl::sycl::range<2>(size, size)); - G_buffer.initialize(args.device_queue, G.data(), cl::sycl::range<2>(size, size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - auto E = E_buffer.get_access(cgh); - - cgh.parallel_for(E_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - for(size_t k = 0; k < size_; k++) { - E[item] += A[{i, k}] * B[{k, j}]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto C = C_buffer.get_access(cgh); - auto D = D_buffer.get_access(cgh); - auto F = F_buffer.get_access(cgh); - - cgh.parallel_for(F_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - for(size_t k = 0; k < size_; k++) { - F[item] += C[{i, k}] * D[{k, j}]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto E = E_buffer.get_access(cgh); - auto F = F_buffer.get_access(cgh); - auto G = G_buffer.get_access(cgh); - - cgh.parallel_for(F_buffer.get_range(), [=, size_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - for(size_t k = 0; k < size_; k++) { - G[item] += E[{i, k}] * F[{k, j}]; - } - }); - })); - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - init_array(A.data(), B.data(), C.data(), D.data(), size); - - std::vector E_cpu(size * size); - std::vector F_cpu(size * size); - std::vector G_cpu(size * size); - - mm3_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), F_cpu.data(), G_cpu.data(), size); - - auto G_acc = G_buffer.get_access(); - - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(G_cpu[i * size + j], G_acc.get_pointer()[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3mm"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector B; - std::vector C; - std::vector D; - std::vector E; - std::vector F; - std::vector G; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; - PrefetchedBuffer C_buffer; - PrefetchedBuffer D_buffer; - PrefetchedBuffer E_buffer; - PrefetchedBuffer F_buffer; - PrefetchedBuffer G_buffer; +public: + Polybench_3mm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + A.resize(size * size); + B.resize(size * size); + C.resize(size * size); + D.resize(size * size); + E.resize(size * size); + F.resize(size * size); + G.resize(size * size); + + init_array(A.data(), B.data(), C.data(), D.data(), size); + + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size)); + D_buffer.initialize(args.device_queue, D.data(), sycl::range<2>(size, size)); + E_buffer.initialize(args.device_queue, E.data(), sycl::range<2>(size, size)); + F_buffer.initialize(args.device_queue, F.data(), sycl::range<2>(size, size)); + G_buffer.initialize(args.device_queue, G.data(), sycl::range<2>(size, size)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + auto E = E_buffer.get_access(cgh); + + cgh.parallel_for(E_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + for(size_t k = 0; k < size_; k++) { + E[item] += A[{i, k}] * B[{k, j}]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto C = C_buffer.get_access(cgh); + auto D = D_buffer.get_access(cgh); + auto F = F_buffer.get_access(cgh); + + cgh.parallel_for(F_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + for(size_t k = 0; k < size_; k++) { + F[item] += C[{i, k}] * D[{k, j}]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto E = E_buffer.get_access(cgh); + auto F = F_buffer.get_access(cgh); + auto G = G_buffer.get_access(cgh); + + cgh.parallel_for(F_buffer.get_range(), [=, size_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + for(size_t k = 0; k < size_; k++) { + G[item] += E[{i, k}] * F[{k, j}]; + } + }); + })); + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + init_array(A.data(), B.data(), C.data(), D.data(), size); + + std::vector E_cpu(size * size); + std::vector F_cpu(size * size); + std::vector G_cpu(size * size); + + mm3_cpu(A.data(), B.data(), C.data(), D.data(), E_cpu.data(), F_cpu.data(), G_cpu.data(), size); + + auto G_acc = G_buffer.get_host_access(); + + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(G_cpu[i * size + j], G_acc.get_pointer()[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_3mm"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector B; + std::vector C; + std::vector D; + std::vector E; + std::vector F; + std::vector G; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; + PrefetchedBuffer C_buffer; + PrefetchedBuffer D_buffer; + PrefetchedBuffer E_buffer; + PrefetchedBuffer F_buffer; + PrefetchedBuffer G_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/atax.cpp b/polybench/atax.cpp index 6321282d..e4aa7958 100644 --- a/polybench/atax.cpp +++ b/polybench/atax.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -18,121 +18,122 @@ class Atax1; class Atax2; void init_array(DATA_TYPE* x, DATA_TYPE* A, size_t size) { - const auto NX = size; - const auto NY = size; - - for(size_t i = 0; i < NX; i++) { - x[i] = i * M_PI; - for(size_t j = 0; j < NY; j++) { - A[i * NY + j] = ((DATA_TYPE)i * (j)) / NX; - } - } + const auto NX = size; + const auto NY = size; + + for(size_t i = 0; i < NX; i++) { + x[i] = i * M_PI; + for(size_t j = 0; j < NY; j++) { + A[i * NY + j] = ((DATA_TYPE)i * (j)) / NX; + } + } } void atax_cpu(DATA_TYPE* A, DATA_TYPE* x, DATA_TYPE* y, DATA_TYPE* tmp, size_t size) { - const auto NX = size; - const auto NY = size; - - for(size_t i = 0; i < NX; i++) { - for(size_t j = 0; j < NY; j++) { - tmp[i] += A[i * NY + j] * x[j]; - } - - for(size_t j = 0; j < NY; j++) { - y[j] += A[i * NY + j] * tmp[i]; - } - } + const auto NX = size; + const auto NY = size; + + for(size_t i = 0; i < NX; i++) { + for(size_t j = 0; j < NY; j++) { + tmp[i] += A[i * NY + j] * x[j]; + } + + for(size_t j = 0; j < NY; j++) { + y[j] += A[i * NY + j] * tmp[i]; + } + } } class Polybench_Atax { - public: - Polybench_Atax(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} +public: + Polybench_Atax(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - x.resize(size); - y.resize(size); - tmp.resize(size); + void setup() { + A.resize(size * size); + x.resize(size); + y.resize(size); + tmp.resize(size); - init_array(x.data(), A.data(), size); + init_array(x.data(), A.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>{size, size}); - x_buffer.initialize(args.device_queue, x.data(), cl::sycl::range<1>{size}); - y_buffer.initialize(args.device_queue, y.data(), cl::sycl::range<1>{size}); - tmp_buffer.initialize(args.device_queue, tmp.data(), cl::sycl::range<1>{size}); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>{size, size}); + x_buffer.initialize(args.device_queue, x.data(), sycl::range<1>{size}); + y_buffer.initialize(args.device_queue, y.data(), sycl::range<1>{size}); + tmp_buffer.initialize(args.device_queue, tmp.data(), sycl::range<1>{size}); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto x = x_buffer.get_access(cgh); - auto tmp = tmp_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto x = x_buffer.get_access(cgh); + auto tmp = tmp_buffer.get_access(cgh); - cgh.parallel_for(tmp_buffer.get_range(), [=, size_ = size](item<1> item) { - const auto i = item[0]; + cgh.parallel_for(tmp_buffer.get_range(), [=, size_ = size](item<1> item) { + const auto i = item[0]; - for(size_t j = 0; j < size_; j++) { - tmp[item] += A[{i, j}] * x[j]; - } - }); - })); + for(size_t j = 0; j < size_; j++) { + tmp[item] += A[{i, j}] * x[j]; + } + }); + })); - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto y = y_buffer.get_access(cgh); - auto tmp = tmp_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto y = y_buffer.get_access(cgh); + auto tmp = tmp_buffer.get_access(cgh); - cgh.parallel_for(y_buffer.get_range(), [=, size_ = size](item<1> item) { - const auto j = item[0]; + cgh.parallel_for(y_buffer.get_range(), [=, size_ = size](item<1> item) { + const auto j = item[0]; - for(size_t i = 0; i < size_; i++) { - y[item] += A[{i, j}] * tmp[i]; - } - }); - })); - } + for(size_t i = 0; i < size_; i++) { + y[item] += A[{i, j}] * tmp[i]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - init_array(x.data(), A.data(), size); + init_array(x.data(), A.data(), size); - std::vector y_cpu(size); - std::vector tmp_cpu(size); + std::vector y_cpu(size); + std::vector tmp_cpu(size); - atax_cpu(A.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size); + atax_cpu(A.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size); - auto y_acc = y_buffer.get_access(); + auto y_acc = y_buffer.get_host_access(); - for(size_t i = 0; i < size; i++) { - const auto diff = percentDiff(y_cpu[i], y_acc[i]); - if(diff > ERROR_THRESHOLD) return false; - } + for(size_t i = 0; i < size; i++) { + const auto diff = percentDiff(y_cpu[i], y_acc[i]); + if(diff > ERROR_THRESHOLD) + return false; + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Atax"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Atax"; } - private: - BenchmarkArgs args; +private: + BenchmarkArgs args; - const size_t size; - std::vector A; - std::vector x; - std::vector y; - std::vector tmp; + const size_t size; + std::vector A; + std::vector x; + std::vector y; + std::vector tmp; - PrefetchedBuffer A_buffer; - PrefetchedBuffer x_buffer; - PrefetchedBuffer y_buffer; - PrefetchedBuffer tmp_buffer; + PrefetchedBuffer A_buffer; + PrefetchedBuffer x_buffer; + PrefetchedBuffer y_buffer; + PrefetchedBuffer tmp_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/bicg.cpp b/polybench/bicg.cpp index 0e569256..4655e602 100644 --- a/polybench/bicg.cpp +++ b/polybench/bicg.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -18,130 +18,132 @@ class Bicg1; class Bicg2; void init_array(DATA_TYPE* A, DATA_TYPE* p, DATA_TYPE* r, size_t size) { - const auto NX = size; - const auto NY = size; + const auto NX = size; + const auto NY = size; - for(size_t i = 0; i < NX; i++) { - r[i] = i * M_PI; + for(size_t i = 0; i < NX; i++) { + r[i] = i * M_PI; - for(size_t j = 0; j < NY; j++) { - A[i * NY + j] = ((DATA_TYPE)i * j) / NX; - } - } + for(size_t j = 0; j < NY; j++) { + A[i * NY + j] = ((DATA_TYPE)i * j) / NX; + } + } - for(size_t i = 0; i < NY; i++) { - p[i] = i * M_PI; - } + for(size_t i = 0; i < NY; i++) { + p[i] = i * M_PI; + } } void bicg_cpu(DATA_TYPE* A, DATA_TYPE* r, DATA_TYPE* s, DATA_TYPE* p, DATA_TYPE* q, size_t size) { - const auto NX = size; - const auto NY = size; - - for(size_t i = 0; i < NX; i++) { - for(size_t j = 0; j < NY; j++) { - s[j] += r[i] * A[i * NY + j]; - q[i] += A[i * NY + j] * p[j]; - } - } + const auto NX = size; + const auto NY = size; + + for(size_t i = 0; i < NX; i++) { + for(size_t j = 0; j < NY; j++) { + s[j] += r[i] * A[i * NY + j]; + q[i] += A[i * NY + j] * p[j]; + } + } } class Polybench_Bicg { - public: - Polybench_Bicg(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - A.resize(size * size); - r.resize(size); - s.resize(size); - p.resize(size); - q.resize(size); - - init_array(A.data(), p.data(), r.data(), size); - - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - r_buffer.initialize(args.device_queue, r.data(), cl::sycl::range<1>(size)); - s_buffer.initialize(args.device_queue, s.data(), cl::sycl::range<1>(size)); - p_buffer.initialize(args.device_queue, p.data(), cl::sycl::range<1>(size)); - q_buffer.initialize(args.device_queue, q.data(), cl::sycl::range<1>(size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto r = r_buffer.get_access(cgh); - auto s = s_buffer.get_access(cgh); - - cgh.parallel_for(s_buffer.get_range(), [=, size_ = size](item<1> item) { - const auto j = item[0]; - - for(size_t i = 0; i < size_; i++) { - s[item] += A[{i, j}] * r[i]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto p = p_buffer.get_access(cgh); - auto q = q_buffer.get_access(cgh); - - cgh.parallel_for(q_buffer.get_range(), [=, size_ = size](item<1> item) { - const auto i = item[0]; - - for(size_t j = 0; j < size_; j++) { - q[item] += A[{i, j}] * p[j]; - } - }); - })); - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - // Trigger writebacks - s_buffer.reset(); - q_buffer.reset(); - - std::vector s_cpu(size); - std::vector q_cpu(size); - - bicg_cpu(A.data(), r.data(), s_cpu.data(), p.data(), q_cpu.data(), size); - - for(size_t i = 0; i < size; i++) { - auto diff = percentDiff(s_cpu[i], s[i]); - if(diff > ERROR_THRESHOLD) return false; - - diff = percentDiff(q_cpu[i], q[i]); - if(diff > ERROR_THRESHOLD) return false; - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Bicg"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector r; - std::vector s; - std::vector p; - std::vector q; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer r_buffer; - PrefetchedBuffer s_buffer; - PrefetchedBuffer p_buffer; - PrefetchedBuffer q_buffer; +public: + Polybench_Bicg(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + A.resize(size * size); + r.resize(size); + s.resize(size); + p.resize(size); + q.resize(size); + + init_array(A.data(), p.data(), r.data(), size); + + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + r_buffer.initialize(args.device_queue, r.data(), sycl::range<1>(size)); + s_buffer.initialize(args.device_queue, s.data(), sycl::range<1>(size)); + p_buffer.initialize(args.device_queue, p.data(), sycl::range<1>(size)); + q_buffer.initialize(args.device_queue, q.data(), sycl::range<1>(size)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto r = r_buffer.get_access(cgh); + auto s = s_buffer.get_access(cgh); + + cgh.parallel_for(s_buffer.get_range(), [=, size_ = size](item<1> item) { + const auto j = item[0]; + + for(size_t i = 0; i < size_; i++) { + s[item] += A[{i, j}] * r[i]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto p = p_buffer.get_access(cgh); + auto q = q_buffer.get_access(cgh); + + cgh.parallel_for(q_buffer.get_range(), [=, size_ = size](item<1> item) { + const auto i = item[0]; + + for(size_t j = 0; j < size_; j++) { + q[item] += A[{i, j}] * p[j]; + } + }); + })); + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + // Trigger writebacks + s_buffer.reset(); + q_buffer.reset(); + + std::vector s_cpu(size); + std::vector q_cpu(size); + + bicg_cpu(A.data(), r.data(), s_cpu.data(), p.data(), q_cpu.data(), size); + + for(size_t i = 0; i < size; i++) { + auto diff = percentDiff(s_cpu[i], s[i]); + if(diff > ERROR_THRESHOLD) + return false; + + diff = percentDiff(q_cpu[i], q[i]); + if(diff > ERROR_THRESHOLD) + return false; + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Bicg"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector r; + std::vector s; + std::vector p; + std::vector q; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer r_buffer; + PrefetchedBuffer s_buffer; + PrefetchedBuffer p_buffer; + PrefetchedBuffer q_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/common/polybenchUtilFuncts.h b/polybench/common/polybenchUtilFuncts.h index 9cc0b0b7..f5390f82 100644 --- a/polybench/common/polybenchUtilFuncts.h +++ b/polybench/common/polybenchUtilFuncts.h @@ -14,34 +14,33 @@ #define SMALL_FLOAT_VAL 0.00000001f double rtclock() { - struct timezone Tzp; - struct timeval Tp; - int stat; - stat = gettimeofday(&Tp, &Tzp); - if(stat != 0) printf("Error return from gettimeofday: %d", stat); - return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); + struct timezone Tzp; + struct timeval Tp; + int stat; + stat = gettimeofday(&Tp, &Tzp); + if(stat != 0) + printf("Error return from gettimeofday: %d", stat); + return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); } float absVal(float a) { - if(a < 0) { - return (a * -1); - } else { - return a; - } + if(a < 0) { + return (a * -1); + } else { + return a; + } } float percentDiff(double val1, double val2) { - if((absVal(val1) < 0.01) && (absVal(val2) < 0.01)) { - return 0.0f; - } else { - return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL))); - } + if((absVal(val1) < 0.01) && (absVal(val2) < 0.01)) { + return 0.0f; + } else { + return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL))); + } } -static bool shouldDoCpu(void) { - return getenv("SYCL_BENCH_SKIP_CPU") == NULL; -} +static bool shouldDoCpu(void) { return getenv("SYCL_BENCH_SKIP_CPU") == NULL; } #endif // POLYBENCH_UTIL_FUNCTS_H diff --git a/polybench/common/syclUtilFuncts.h b/polybench/common/syclUtilFuncts.h index 77ab6fc8..bff71444 100644 --- a/polybench/common/syclUtilFuncts.h +++ b/polybench/common/syclUtilFuncts.h @@ -1,18 +1,18 @@ #ifndef SYCL_UTIL_FUNCTS_H #define SYCL_UTIL_FUNCTS_H -#include +#include template -void initDeviceBuffer(cl::sycl::queue& queue, cl::sycl::buffer& buffer, T* data) { - using namespace cl::sycl; +void initDeviceBuffer(sycl::queue& queue, sycl::buffer& buffer, T* data) { + using namespace sycl; - queue.submit([&](handler& cgh) { - auto accessor = buffer.template get_access(cgh); - cgh.copy(data, accessor); - }); + queue.submit([&](handler& cgh) { + auto accessor = buffer.template get_access(cgh); + cgh.copy(data, accessor); + }); - queue.wait(); + queue.wait(); } #endif diff --git a/polybench/correlation.cpp b/polybench/correlation.cpp index 7e098417..ab552b91 100644 --- a/polybench/correlation.cpp +++ b/polybench/correlation.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -23,210 +23,219 @@ class CorrelationCorr; class Correlation5; void init_arrays(DATA_TYPE* data, size_t size) { - const auto M = size; - const auto N = size; - - for(size_t i = 0; i <= M; i++) { - for(size_t j = 0; j <= N; j++) { - data[i * N + j] = ((DATA_TYPE)i * j) / (M + 1); - } - } + const auto M = size; + const auto N = size; + + for(size_t i = 0; i <= M; i++) { + for(size_t j = 0; j <= N; j++) { + data[i * N + j] = ((DATA_TYPE)i * j) / (M + 1); + } + } } void correlation(DATA_TYPE* data, DATA_TYPE* mean, DATA_TYPE* stddev, DATA_TYPE* symmat, size_t size) { - const auto M = size; - const auto N = size; - - // Determine mean of column vectors of input data matrix - for(size_t j = 1; j <= M; j++) { - mean[j] = 0.0; - - for(size_t i = 1; i <= N; i++) { - mean[j] += data[i * (M + 1) + j]; - } - - mean[j] /= (DATA_TYPE)FLOAT_N; - } - - // Determine standard deviations of column vectors of data matrix. - for(size_t j = 1; j <= M; j++) { - stddev[j] = 0.0; - - for(size_t i = 1; i <= N; i++) { - stddev[j] += (data[i * (M + 1) + j] - mean[j]) * (data[i * (M + 1) + j] - mean[j]); - } - - stddev[j] /= FLOAT_N; - stddev[j] = sqrt_of_array_cell(stddev, j); - stddev[j] = stddev[j] <= EPS ? 1.0 : stddev[j]; - } - - // Center and reduce the column vectors. - for(size_t i = 1; i <= N; i++) { - for(size_t j = 1; j <= M; j++) { - data[i * (M + 1) + j] -= mean[j]; - data[i * (M + 1) + j] /= sqrt(FLOAT_N); - data[i * (M + 1) + j] /= stddev[j]; - } - } - - // Calculate the m * m correlation matrix. - for(size_t j1 = 1; j1 <= M - 1; j1++) { - symmat[j1 * (M + 1) + j1] = 1.0; - - for(size_t j2 = j1 + 1; j2 <= M; j2++) { - symmat[j1 * (M + 1) + j2] = 0.0; - - for(size_t i = 1; i <= N; i++) { - symmat[j1 * (M + 1) + j2] += (data[i * (M + 1) + j1] * data[i * (M + 1) + j2]); - } - - symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2]; - } - } - - symmat[M * (M + 1) + M] = 1.0; + const auto M = size; + const auto N = size; + + // Determine mean of column vectors of input data matrix + for(size_t j = 1; j <= M; j++) { + mean[j] = 0.0; + + for(size_t i = 1; i <= N; i++) { + mean[j] += data[i * (M + 1) + j]; + } + + mean[j] /= (DATA_TYPE)FLOAT_N; + } + + // Determine standard deviations of column vectors of data matrix. + for(size_t j = 1; j <= M; j++) { + stddev[j] = 0.0; + + for(size_t i = 1; i <= N; i++) { + stddev[j] += (data[i * (M + 1) + j] - mean[j]) * (data[i * (M + 1) + j] - mean[j]); + } + + stddev[j] /= FLOAT_N; + stddev[j] = sqrt_of_array_cell(stddev, j); + stddev[j] = stddev[j] <= EPS ? 1.0 : stddev[j]; + } + + // Center and reduce the column vectors. + for(size_t i = 1; i <= N; i++) { + for(size_t j = 1; j <= M; j++) { + data[i * (M + 1) + j] -= mean[j]; + data[i * (M + 1) + j] /= sqrt(FLOAT_N); + data[i * (M + 1) + j] /= stddev[j]; + } + } + + // Calculate the m * m correlation matrix. + for(size_t j1 = 1; j1 <= M - 1; j1++) { + symmat[j1 * (M + 1) + j1] = 1.0; + + for(size_t j2 = j1 + 1; j2 <= M; j2++) { + symmat[j1 * (M + 1) + j2] = 0.0; + + for(size_t i = 1; i <= N; i++) { + symmat[j1 * (M + 1) + j2] += (data[i * (M + 1) + j1] * data[i * (M + 1) + j2]); + } + + symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2]; + } + } + + symmat[M * (M + 1) + M] = 1.0; } class Polybench_Correlation { - public: - Polybench_Correlation(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - data.resize((size + 1) * (size + 1)); - mean.resize(size + 1); - stddev.resize(size + 1); - symmat.resize((size + 1) * (size + 1)); - - init_arrays(data.data(), size); - - data_buffer.initialize(args.device_queue, data.data(), cl::sycl::range<2>(size + 1, size + 1)); - mean_buffer.initialize(args.device_queue, mean.data(), cl::sycl::range<1>(size + 1)); - stddev_buffer.initialize(args.device_queue, stddev.data(), cl::sycl::range<1>(size + 1)); - symmat_buffer.initialize(args.device_queue, symmat.data(), cl::sycl::range<2>(size + 1, size + 1)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto mean = mean_buffer.get_access(cgh); - - cgh.parallel_for(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) { - const auto j = item[0]; - - for(size_t i = 1; i <= N_; i++) { - mean[item] += data[{i, j}]; - } - mean[item] /= ((DATA_TYPE)FLOAT_N); - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto mean = mean_buffer.get_access(cgh); - auto stddev = stddev_buffer.get_access(cgh); - - cgh.parallel_for(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) { - const auto j = item[0]; - - for(size_t i = 1; i <= N_; i++) { - stddev[item] += (data[{i, j}] - mean[item]) * (data[{i, j}] - mean[item]); - } - - stddev[item] /= FLOAT_N; - stddev[item] = cl::sycl::sqrt(stddev[item]); - stddev[item] = stddev[item] <= EPS ? 1.0 : stddev[item]; - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto mean = mean_buffer.get_access(cgh); - auto stddev = stddev_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, size), id<2>(1, 1), [=](item<2> item) { - const auto j = item[1]; - - data[item] -= mean[j]; - data[item] /= cl::sycl::sqrt(FLOAT_N); - data[item] /= stddev[j]; - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto symmat = symmat_buffer.get_access(cgh); - - cgh.parallel_for(range<1>(size), id<1>(1), [=, M_ = size, N_ = size](item<1> item) { - // if(item[0] >= M_ - 1) return; - - const auto j1 = item[0]; - - symmat[{j1, j1}] = 1.0; - - for(size_t j2 = j1 + 1; j2 <= M_; j2++) { - symmat[{j1, j2}] = 0.0; - - for(size_t i = 1; i <= N_; i++) { - symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}]; - } - - symmat[{j2, j1}] = symmat[{j1, j2}]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto symmat = symmat_buffer.get_access(cgh); - cgh.parallel_for(range<2>(1, 1), id<2>(size, size), [=](item<2> item) { symmat[item] = 1.0; }); - })); - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - std::vector data_cpu((size + 1) * (size + 1)); - std::vector mean_cpu(size + 1); - std::vector stddev_cpu(size + 1); - std::vector symmat_cpu((size + 1) * (size + 1)); - - // Trigger writeback - symmat_buffer.reset(); - - init_arrays(data_cpu.data(), size); - correlation(data_cpu.data(), mean_cpu.data(), stddev_cpu.data(), symmat_cpu.data(), size); - - for(size_t i = 1; i < size + 1; i++) { - for(size_t j = 1; j < size + 1; j++) { - const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Correlation"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector data; - std::vector mean; - std::vector stddev; - std::vector symmat; - - PrefetchedBuffer data_buffer; +public: + Polybench_Correlation(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + data.resize((size + 1) * (size + 1)); + mean.resize(size + 1); + stddev.resize(size + 1); + symmat.resize((size + 1) * (size + 1)); + + init_arrays(data.data(), size); + + data_buffer.initialize(args.device_queue, data.data(), sycl::range<2>(size + 1, size + 1)); + mean_buffer.initialize(args.device_queue, mean.data(), sycl::range<1>(size + 1)); + stddev_buffer.initialize(args.device_queue, stddev.data(), sycl::range<1>(size + 1)); + symmat_buffer.initialize(args.device_queue, symmat.data(), sycl::range<2>(size + 1, size + 1)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto mean = mean_buffer.get_access(cgh); + + cgh.parallel_for(range<1>(size), [=, N_ = size](id<1> gid) { + const id<1> offset(1); + const auto j = gid[0] + offset[0]; + + for(size_t i = 1; i <= N_; i++) { + mean[gid + offset] += data[{i, j}]; + } + mean[gid + offset] /= ((DATA_TYPE)FLOAT_N); + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto mean = mean_buffer.get_access(cgh); + auto stddev = stddev_buffer.get_access(cgh); + + cgh.parallel_for(range<1>(size), [=, N_ = size](id<1> gid) { + const id<1> offset(1); + const auto adj_id = gid + offset; + const auto j = gid[0] + offset[0]; + + for(size_t i = 1; i <= N_; i++) { + stddev[adj_id] += (data[{i, j}] - mean[adj_id]) * (data[{i, j}] - mean[adj_id]); + } + + stddev[adj_id] /= FLOAT_N; + stddev[adj_id] = sycl::sqrt(stddev[adj_id]); + stddev[adj_id] = stddev[adj_id] <= EPS ? 1.0 : stddev[adj_id]; + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto mean = mean_buffer.get_access(cgh); + auto stddev = stddev_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, size), [=](id<2> gid) { + const id<2> offset(1, 1); + const auto adj_id = gid + offset; + const auto j = gid[1] + offset[1]; + + data[adj_id] -= mean[j]; + data[adj_id] /= sycl::sqrt(FLOAT_N); + data[adj_id] /= stddev[j]; + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto symmat = symmat_buffer.get_access(cgh); + + cgh.parallel_for(range<1>(size), [=, M_ = size, N_ = size](id<1> gid) { + // if(item[0] >= M_ - 1) return; + const id<1> offset(1); + const auto j1 = gid[0] + offset[0]; + + symmat[{j1, j1}] = 1.0; + + for(size_t j2 = j1 + 1; j2 <= M_; j2++) { + symmat[{j1, j2}] = 0.0; + + for(size_t i = 1; i <= N_; i++) { + symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}]; + } + + symmat[{j2, j1}] = symmat[{j1, j2}]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto symmat = symmat_buffer.get_access(cgh); + cgh.parallel_for(range<2>(1, 1), [=, M_ = size](id<2> gid) { + const id<2> offset(M_, M_); + symmat[gid + offset] = 1.0; + }); + })); + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + std::vector data_cpu((size + 1) * (size + 1)); + std::vector mean_cpu(size + 1); + std::vector stddev_cpu(size + 1); + std::vector symmat_cpu((size + 1) * (size + 1)); + + // Trigger writeback + symmat_buffer.reset(); + + init_arrays(data_cpu.data(), size); + correlation(data_cpu.data(), mean_cpu.data(), stddev_cpu.data(), symmat_cpu.data(), size); + + for(size_t i = 1; i < size + 1; i++) { + for(size_t j = 1; j < size + 1; j++) { + const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Correlation"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector data; + std::vector mean; + std::vector stddev; + std::vector symmat; + + PrefetchedBuffer data_buffer; PrefetchedBuffer mean_buffer; - PrefetchedBuffer stddev_buffer; - PrefetchedBuffer symmat_buffer; + PrefetchedBuffer stddev_buffer; + PrefetchedBuffer symmat_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; -} + BenchmarkApp app(argc, argv); + app.run(); + return 0; +} \ No newline at end of file diff --git a/polybench/covariance.cpp b/polybench/covariance.cpp index c15c37b4..88b22f06 100644 --- a/polybench/covariance.cpp +++ b/polybench/covariance.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -18,155 +18,159 @@ class CovarianceCovar; constexpr DATA_TYPE float_n = 3214212.01; void init_arrays(DATA_TYPE* data, size_t size) { - const auto M = size; - const auto N = size; - - for(size_t i = 0; i < M; i++) { - for(size_t j = 0; j < N; j++) { - data[i * (N + 1) + j] = ((DATA_TYPE)i * j) / M; - } - } + const auto M = size; + const auto N = size; + + for(size_t i = 0; i < M; i++) { + for(size_t j = 0; j < N; j++) { + data[i * (N + 1) + j] = ((DATA_TYPE)i * j) / M; + } + } } void covariance(DATA_TYPE* data, DATA_TYPE* symmat, DATA_TYPE* mean, size_t size) { - const auto M = size; - const auto N = size; - - // Determine mean of column vectors of input data matrix - for(size_t j = 1; j <= M; j++) { - mean[j] = 0.0; - for(size_t i = 1; i <= N; i++) { - mean[j] += data[i * (M + 1) + j]; - } - mean[j] /= float_n; - } - - // Center the column vectors. - for(size_t i = 1; i <= N; i++) { - for(size_t j = 1; j <= M; j++) { - data[i * (M + 1) + j] -= mean[j]; - } - } - - // Calculate the m * m covariance matrix. - for(size_t j1 = 1; j1 <= M; j1++) { - for(size_t j2 = j1; j2 <= M; j2++) { - symmat[j1 * (M + 1) + j2] = 0.0; - for(size_t i = 1; i <= N; i++) { - symmat[j1 * (M + 1) + j2] += data[i * (M + 1) + j1] * data[i * (M + 1) + j2]; - } - symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2]; - } - } + const auto M = size; + const auto N = size; + + // Determine mean of column vectors of input data matrix + for(size_t j = 1; j <= M; j++) { + mean[j] = 0.0; + for(size_t i = 1; i <= N; i++) { + mean[j] += data[i * (M + 1) + j]; + } + mean[j] /= float_n; + } + + // Center the column vectors. + for(size_t i = 1; i <= N; i++) { + for(size_t j = 1; j <= M; j++) { + data[i * (M + 1) + j] -= mean[j]; + } + } + + // Calculate the m * m covariance matrix. + for(size_t j1 = 1; j1 <= M; j1++) { + for(size_t j2 = j1; j2 <= M; j2++) { + symmat[j1 * (M + 1) + j2] = 0.0; + for(size_t i = 1; i <= N; i++) { + symmat[j1 * (M + 1) + j2] += data[i * (M + 1) + j1] * data[i * (M + 1) + j2]; + } + symmat[j2 * (M + 1) + j1] = symmat[j1 * (M + 1) + j2]; + } + } } class Polybench_Covariance { public: - Polybench_Covariance(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + Polybench_Covariance(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - data.resize((size + 1) * (size + 1)); - symmat.resize((size + 1) * (size + 1)); - mean.resize(size + 1); + void setup() { + data.resize((size + 1) * (size + 1)); + symmat.resize((size + 1) * (size + 1)); + mean.resize(size + 1); - init_arrays(data.data(), size); + init_arrays(data.data(), size); - data_buffer.initialize(args.device_queue, data.data(), cl::sycl::range<2>(size + 1, size + 1)); - symmat_buffer.initialize(args.device_queue, symmat.data(), cl::sycl::range<2>(size + 1, size + 1)); - mean_buffer.initialize(args.device_queue, mean.data(), cl::sycl::range<1>(size + 1)); + data_buffer.initialize(args.device_queue, data.data(), sycl::range<2>(size + 1, size + 1)); + symmat_buffer.initialize(args.device_queue, symmat.data(), sycl::range<2>(size + 1, size + 1)); + mean_buffer.initialize(args.device_queue, mean.data(), sycl::range<1>(size + 1)); } - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto mean = mean_buffer.get_access(cgh); - - cgh.parallel_for(range<1>(size), id<1>(1), [=, N_ = size](item<1> item) { - const auto j = item[0]; - - mean[item] = 0; - for(size_t i = 1; i <= N_; i++) { - mean[item] += data[{i, j}]; - } - mean[item] /= float_n; - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto mean = mean_buffer.get_access(cgh); - auto data = data_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, size), id<2>(1, 1), [=](item<2> item) { - const auto j = item[1]; - data[item] -= mean[j]; - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto data = data_buffer.get_access(cgh); - auto symmat = symmat_buffer.get_access(cgh); - auto symmat2 = symmat_buffer.get_access(cgh); - - cgh.parallel_for(range<1>(size), id<1>(1), [=, M_ = size, N_ = size](item<1> item) { - const auto j1 = item[0]; - - symmat[{j1, j1}] = 1.0; - - for(size_t j2 = j1; j2 <= M_; j2++) { - symmat[{j1, j2}] = 0.0; - for(size_t i = 1; i <= N_; i++) { - symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}]; - } - - symmat2[{j2, j1}] = symmat[{j1, j2}]; - } - }); - })); - } + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto mean = mean_buffer.get_access(cgh); + + cgh.parallel_for(range<1>(size), [=, N_ = size](id<1> gid) { + const id<1> offset(1); + const auto j = gid[0] + offset[0]; + + mean[gid + offset] = 0; + for(size_t i = 1; i <= N_; i++) { + mean[gid + offset] += data[{i, j}]; + } + mean[gid + offset] /= float_n; + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto mean = mean_buffer.get_access(cgh); + auto data = data_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, size), [=](id<2> gid) { + const id<2> offset(1, 1); + const auto j = gid[1] + offset[1]; + data[gid + offset] -= mean[j]; + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto data = data_buffer.get_access(cgh); + auto symmat = symmat_buffer.get_access(cgh); + auto symmat2 = symmat_buffer.get_access(cgh); + + cgh.parallel_for(range<1>(size), [=, M_ = size, N_ = size](id<1> gid) { + const id<1> offset(1); + const auto j1 = gid[0] + offset[0]; + + symmat[{j1, j1}] = 1.0; + + for(size_t j2 = j1; j2 <= M_; j2++) { + symmat[{j1, j2}] = 0.0; + for(size_t i = 1; i <= N_; i++) { + symmat[{j1, j2}] += data[{i, j1}] * data[{i, j2}]; + } + + symmat2[{j2, j1}] = symmat[{j1, j2}]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - std::vector data_cpu((size + 1) * (size + 1)); - std::vector symmat_cpu((size + 1) * (size + 1)); - std::vector mean_cpu(size + 1); + std::vector data_cpu((size + 1) * (size + 1)); + std::vector symmat_cpu((size + 1) * (size + 1)); + std::vector mean_cpu(size + 1); - // Trigger writeback - symmat_buffer.reset(); + // Trigger writeback + symmat_buffer.reset(); - init_arrays(data_cpu.data(), size); + init_arrays(data_cpu.data(), size); - covariance(data_cpu.data(), symmat_cpu.data(), mean_cpu.data(), size); + covariance(data_cpu.data(), symmat_cpu.data(), mean_cpu.data(), size); - for(size_t i = 1; i < size + 1; i++) { - for(size_t j = 1; j < size + 1; j++) { - const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } + for(size_t i = 1; i < size + 1; i++) { + for(size_t j = 1; j < size + 1; j++) { + const auto diff = percentDiff(symmat_cpu[i * (size + 1) + j], symmat[i * (size + 1) + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Covariance"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Covariance"; } private: - BenchmarkArgs args; + BenchmarkArgs args; - const size_t size; - std::vector data; - std::vector symmat; - std::vector mean; + const size_t size; + std::vector data; + std::vector symmat; + std::vector mean; - PrefetchedBuffer data_buffer; - PrefetchedBuffer symmat_buffer; - PrefetchedBuffer mean_buffer; + PrefetchedBuffer data_buffer; + PrefetchedBuffer symmat_buffer; + PrefetchedBuffer mean_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; -} + BenchmarkApp app(argc, argv); + app.run(); + return 0; +} \ No newline at end of file diff --git a/polybench/fdtd2d.cpp b/polybench/fdtd2d.cpp index d7f8444a..42d17953 100644 --- a/polybench/fdtd2d.cpp +++ b/polybench/fdtd2d.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -17,188 +17,192 @@ class Fdtd2d3; constexpr auto TMAX = 500; void init_arrays(DATA_TYPE* fict, DATA_TYPE* ex, DATA_TYPE* ey, DATA_TYPE* hz, size_t size) { - const auto NX = size; - const auto NY = size; - - for(size_t i = 0; i < TMAX; i++) { - fict[i] = (DATA_TYPE)i; - } - - for(size_t i = 0; i < NX; i++) { - for(size_t j = 0; j < NY; j++) { - ex[i * NY + j] = ((DATA_TYPE)i * (j + 1) + 1) / NX; - ey[i * NY + j] = ((DATA_TYPE)(i - 1) * (j + 2) + 2) / NX; - hz[i * NY + j] = ((DATA_TYPE)(i - 9) * (j + 4) + 3) / NX; - } - } + const auto NX = size; + const auto NY = size; + + for(size_t i = 0; i < TMAX; i++) { + fict[i] = (DATA_TYPE)i; + } + + for(size_t i = 0; i < NX; i++) { + for(size_t j = 0; j < NY; j++) { + ex[i * NY + j] = ((DATA_TYPE)i * (j + 1) + 1) / NX; + ey[i * NY + j] = ((DATA_TYPE)(i - 1) * (j + 2) + 2) / NX; + hz[i * NY + j] = ((DATA_TYPE)(i - 9) * (j + 4) + 3) / NX; + } + } } void runFdtd(DATA_TYPE* fict, DATA_TYPE* ex, DATA_TYPE* ey, DATA_TYPE* hz, size_t size) { - const auto NX = size; - const auto NY = size; - - for(size_t t = 0; t < TMAX; t++) { - for(size_t j = 0; j < NY; j++) { - ey[0 * NY + j] = fict[t]; - } - - for(size_t i = 1; i < NX; i++) { - for(size_t j = 0; j < NY; j++) { - ey[i * NY + j] = ey[i * NY + j] - 0.5 * (hz[i * NY + j] - hz[(i - 1) * NY + j]); - } - } - - for(size_t i = 0; i < NX; i++) { - for(size_t j = 1; j < NY; j++) { - ex[i * (NY + 1) + j] = ex[i * (NY + 1) + j] - 0.5 * (hz[i * NY + j] - hz[i * NY + (j - 1)]); - } - } - - for(size_t i = 0; i < NX; i++) { - for(size_t j = 0; j < NY; j++) { - hz[i * NY + j] = hz[i * NY + j] - 0.7 * (ex[i * (NY + 1) + (j + 1)] - ex[i * (NY + 1) + j] + ey[(i + 1) * NY + j] - ey[i * NY + j]); - } - } - } + const auto NX = size; + const auto NY = size; + + for(size_t t = 0; t < TMAX; t++) { + for(size_t j = 0; j < NY; j++) { + ey[0 * NY + j] = fict[t]; + } + + for(size_t i = 1; i < NX; i++) { + for(size_t j = 0; j < NY; j++) { + ey[i * NY + j] = ey[i * NY + j] - 0.5 * (hz[i * NY + j] - hz[(i - 1) * NY + j]); + } + } + + for(size_t i = 0; i < NX; i++) { + for(size_t j = 1; j < NY; j++) { + ex[i * (NY + 1) + j] = ex[i * (NY + 1) + j] - 0.5 * (hz[i * NY + j] - hz[i * NY + (j - 1)]); + } + } + + for(size_t i = 0; i < NX; i++) { + for(size_t j = 0; j < NY; j++) { + hz[i * NY + j] = hz[i * NY + j] - 0.7 * (ex[i * (NY + 1) + (j + 1)] - ex[i * (NY + 1) + j] + + ey[(i + 1) * NY + j] - ey[i * NY + j]); + } + } + } } class Polybench_Fdtd2d { - public: - Polybench_Fdtd2d(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - fict.resize(TMAX); - ex.resize(size * (size + 1)); - ey.resize((size + 1) * size); - hz.resize(size * size); - - init_arrays(fict.data(), ex.data(), ey.data(), hz.data(), size); - - fict_buffer.initialize(args.device_queue, fict.data(), cl::sycl::range<1>(TMAX)); - ex_buffer.initialize(args.device_queue, ex.data(), cl::sycl::range<2>(size, size + 1)); - ey_buffer.initialize(args.device_queue, ey.data(), cl::sycl::range<2>(size + 1, size)); - hz_buffer.initialize(args.device_queue, hz.data(), cl::sycl::range<2>(size, size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - for(size_t t = 0; t < TMAX; t++) { - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto fict = fict_buffer.get_access(cgh); - auto ey = ey_buffer.get_access(cgh); - auto hz = hz_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, size), [=](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - if(i == 0) { - ey[item] = fict[t]; - } else { - ey[item] = ey[item] - 0.5 * (hz[item] - hz[{(i - 1), j}]); - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto ex = ex_buffer.get_access(cgh); - auto hz = hz_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - if(j > 0) ex[item] = ex[item] - 0.5 * (hz[item] - hz[{i, (j - 1)}]); - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto ex = ex_buffer.get_access(cgh); - auto ey = ey_buffer.get_access(cgh); - auto hz = hz_buffer.get_access(cgh); - - cgh.parallel_for(hz_buffer.get_range(), [=](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; - - hz[item] = hz[item] - 0.7 * (ex[{i, (j + 1)}] - ex[item] + ey[{(i + 1), j}] - ey[item]); - }); - })); - } - } - - bool verify(VerificationSetting&) { - // Yes, this is threshold is used by polybench/CUDA/fdtd2d. Numbers in - // this benchmark can get pretty large and regular floats don't provide - // enough precision. This verification may fail on some problem sizes. - constexpr auto ERROR_THRESHOLD = 10.05; - - std::vector fict_cpu(TMAX); - std::vector ex_cpu(size * (size + 1)); - std::vector ey_cpu((size + 1) * size); - std::vector hz_cpu(size * size); - - // Trigger writebacks - hz_buffer.reset(); - - init_arrays(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size); - - runFdtd(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size); - - // for(size_t i = 0; i < size; i++) { - // for(size_t j = 0; j < size; j++) { - // const auto diff = percentDiff(ex_cpu[i * size + j], ex[i * size + j]); - // if(diff > ERROR_THRESHOLD) { - // printf("%ld %ld: %f %f %f\n", i, j, ex_cpu[i * size + j], ex[i * size + j], diff); - // return false; - // } - // } - // } - - // for(size_t i = 0; i < size; i++) { - // for(size_t j = 0; j < size; j++) { - // const auto diff = percentDiff(ey_cpu[i * size + j], ey[i * size + j]); - // if(diff > ERROR_THRESHOLD) { - // printf("%ld %ld: %f %f %f\n", i, j, ey_cpu[i * size + j], ey[i * size + j], diff); - // return false; - // } - // } - // } - - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(hz_cpu[i * size + j], hz[i * size + j]); - if(diff > ERROR_THRESHOLD) { - printf("%ld %ld: %f %f %f\n", i, j, hz_cpu[i * size + j], hz[i * size + j], diff); - return false; - } - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Fdtd2d"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector fict; - std::vector ex; - std::vector ey; - std::vector hz; - - PrefetchedBuffer fict_buffer; - PrefetchedBuffer ex_buffer; - PrefetchedBuffer ey_buffer; - PrefetchedBuffer hz_buffer; +public: + Polybench_Fdtd2d(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + fict.resize(TMAX); + ex.resize(size * (size + 1)); + ey.resize((size + 1) * size); + hz.resize(size * size); + + init_arrays(fict.data(), ex.data(), ey.data(), hz.data(), size); + + fict_buffer.initialize(args.device_queue, fict.data(), sycl::range<1>(TMAX)); + ex_buffer.initialize(args.device_queue, ex.data(), sycl::range<2>(size, size + 1)); + ey_buffer.initialize(args.device_queue, ey.data(), sycl::range<2>(size + 1, size)); + hz_buffer.initialize(args.device_queue, hz.data(), sycl::range<2>(size, size)); + } + + void run(std::vector& events) { + using namespace sycl; + + for(size_t t = 0; t < TMAX; t++) { + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto fict = fict_buffer.get_access(cgh); + auto ey = ey_buffer.get_access(cgh); + auto hz = hz_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, size), [=](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + if(i == 0) { + ey[item] = fict[t]; + } else { + ey[item] = ey[item] - 0.5 * (hz[item] - hz[{(i - 1), j}]); + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto ex = ex_buffer.get_access(cgh); + auto hz = hz_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + if(j > 0) + ex[item] = ex[item] - 0.5 * (hz[item] - hz[{i, (j - 1)}]); + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto ex = ex_buffer.get_access(cgh); + auto ey = ey_buffer.get_access(cgh); + auto hz = hz_buffer.get_access(cgh); + + cgh.parallel_for(hz_buffer.get_range(), [=](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; + + hz[item] = hz[item] - 0.7 * (ex[{i, (j + 1)}] - ex[item] + ey[{(i + 1), j}] - ey[item]); + }); + })); + } + } + + bool verify(VerificationSetting&) { + // Yes, this is threshold is used by polybench/CUDA/fdtd2d. Numbers in + // this benchmark can get pretty large and regular floats don't provide + // enough precision. This verification may fail on some problem sizes. + constexpr auto ERROR_THRESHOLD = 10.05; + + std::vector fict_cpu(TMAX); + std::vector ex_cpu(size * (size + 1)); + std::vector ey_cpu((size + 1) * size); + std::vector hz_cpu(size * size); + + // Trigger writebacks + hz_buffer.reset(); + + init_arrays(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size); + + runFdtd(fict_cpu.data(), ex_cpu.data(), ey_cpu.data(), hz_cpu.data(), size); + + // for(size_t i = 0; i < size; i++) { + // for(size_t j = 0; j < size; j++) { + // const auto diff = percentDiff(ex_cpu[i * size + j], ex[i * size + j]); + // if(diff > ERROR_THRESHOLD) { + // printf("%ld %ld: %f %f %f\n", i, j, ex_cpu[i * size + j], ex[i * size + j], diff); + // return false; + // } + // } + // } + + // for(size_t i = 0; i < size; i++) { + // for(size_t j = 0; j < size; j++) { + // const auto diff = percentDiff(ey_cpu[i * size + j], ey[i * size + j]); + // if(diff > ERROR_THRESHOLD) { + // printf("%ld %ld: %f %f %f\n", i, j, ey_cpu[i * size + j], ey[i * size + j], diff); + // return false; + // } + // } + // } + + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(hz_cpu[i * size + j], hz[i * size + j]); + if(diff > ERROR_THRESHOLD) { + printf("%ld %ld: %f %f %f\n", i, j, hz_cpu[i * size + j], hz[i * size + j], diff); + return false; + } + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Fdtd2d"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector fict; + std::vector ex; + std::vector ey; + std::vector hz; + + PrefetchedBuffer fict_buffer; + PrefetchedBuffer ex_buffer; + PrefetchedBuffer ey_buffer; + PrefetchedBuffer hz_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - if(app.deviceSupportsFP64()) - app.run(); - return 0; + BenchmarkApp app(argc, argv); + + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { + app.run(); + } + return 0; } diff --git a/polybench/gemm.cpp b/polybench/gemm.cpp index e1bff491..417763d4 100644 --- a/polybench/gemm.cpp +++ b/polybench/gemm.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -16,121 +16,122 @@ using DATA_TYPE = float; class Gemm; void init(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NK; j++) { - A[i * NK + j] = ((DATA_TYPE)i * j) / NI; - } - } - - for(size_t i = 0; i < NK; i++) { - for(size_t j = 0; j < NJ; j++) { - B[i * NJ + j] = ((DATA_TYPE)i * j + 1) / NJ; - } - } - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NJ; j++) { - C[i * NJ + j] = ((DATA_TYPE)i * j + 2) / NJ; - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NK; j++) { + A[i * NK + j] = ((DATA_TYPE)i * j) / NI; + } + } + + for(size_t i = 0; i < NK; i++) { + for(size_t j = 0; j < NJ; j++) { + B[i * NJ + j] = ((DATA_TYPE)i * j + 1) / NJ; + } + } + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NJ; j++) { + C[i * NJ + j] = ((DATA_TYPE)i * j + 2) / NJ; + } + } } void gemm(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) { - const auto NI = size; - const auto NJ = size; - const auto NK = size; - - for(size_t i = 0; i < NI; i++) { - for(size_t j = 0; j < NJ; j++) { - C[i * NJ + j] *= BETA; - - for(size_t k = 0; k < NK; ++k) { - C[i * NJ + j] += ALPHA * A[i * NK + k] * B[k * NJ + j]; - } - } - } + const auto NI = size; + const auto NJ = size; + const auto NK = size; + + for(size_t i = 0; i < NI; i++) { + for(size_t j = 0; j < NJ; j++) { + C[i * NJ + j] *= BETA; + + for(size_t k = 0; k < NK; ++k) { + C[i * NJ + j] += ALPHA * A[i * NK + k] * B[k * NJ + j]; + } + } + } } class Polybench_Gemm { - public: - Polybench_Gemm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} +public: + Polybench_Gemm(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - B.resize(size * size); - C.resize(size * size); + void setup() { + A.resize(size * size); + B.resize(size * size); + C.resize(size * size); - init(A.data(), B.data(), C.data(), size); + init(A.data(), B.data(), C.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size)); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size)); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - auto C = C_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + auto C = C_buffer.get_access(cgh); - cgh.parallel_for(C_buffer.get_range(), [=, NK_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; + cgh.parallel_for(C_buffer.get_range(), [=, NK_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; - C[item] *= BETA; + C[item] *= BETA; - for(size_t k = 0; k < NK_; k++) { - C[item] += ALPHA * A[{i, k}] * B[{k, j}]; - } - }); - })); - } + for(size_t k = 0; k < NK_; k++) { + C[item] += ALPHA * A[{i, k}] * B[{k, j}]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - // Trigger writeback - C_buffer.reset(); + // Trigger writeback + C_buffer.reset(); - std::vector C_cpu(size * size); + std::vector C_cpu(size * size); - init(A.data(), B.data(), C_cpu.data(), size); + init(A.data(), B.data(), C_cpu.data(), size); - gemm(A.data(), B.data(), C_cpu.data(), size); + gemm(A.data(), B.data(), C_cpu.data(), size); - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gemm"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gemm"; } private: - BenchmarkArgs args; + BenchmarkArgs args; - const size_t size; - std::vector A; - std::vector B; - std::vector C; + const size_t size; + std::vector A; + std::vector B; + std::vector C; - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; - PrefetchedBuffer C_buffer; + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; + PrefetchedBuffer C_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/gesummv.cpp b/polybench/gesummv.cpp index 9b3429f2..606e6786 100644 --- a/polybench/gesummv.cpp +++ b/polybench/gesummv.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -23,120 +23,122 @@ constexpr DATA_TYPE BETA = 1; // if(percentDiff(y[i], y_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD) fail++; // } -// printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail); +// printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", +// PERCENT_DIFF_ERROR_THRESHOLD, fail); // } void init(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* x, size_t size) { - const auto N = size; + const auto N = size; - for(size_t i = 0; i < N; i++) { - x[i] = 1; + for(size_t i = 0; i < N; i++) { + x[i] = 1; - for(size_t j = 0; j < N; j++) { - A[i * N + j] = 2; - B[i * N + j] = 3; - } - } + for(size_t j = 0; j < N; j++) { + A[i * N + j] = 2; + B[i * N + j] = 3; + } + } } void gesummv(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* x, DATA_TYPE* y, DATA_TYPE* tmp, size_t size) { - const auto N = size; - - for(size_t i = 0; i < N; i++) { - tmp[i] = 0; - y[i] = 0; - for(size_t j = 0; j < N; j++) { - tmp[i] = A[i * N + j] * x[j] + tmp[i]; - y[i] = B[i * N + j] * x[j] + y[i]; - } - - y[i] = ALPHA * tmp[i] + BETA * y[i]; - } + const auto N = size; + + for(size_t i = 0; i < N; i++) { + tmp[i] = 0; + y[i] = 0; + for(size_t j = 0; j < N; j++) { + tmp[i] = A[i * N + j] * x[j] + tmp[i]; + y[i] = B[i * N + j] * x[j] + y[i]; + } + + y[i] = ALPHA * tmp[i] + BETA * y[i]; + } } class Polybench_Gesummv { public: - Polybench_Gesummv(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + Polybench_Gesummv(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - B.resize(size * size); - x.resize(size); - y.resize(size); - tmp.resize(size); + void setup() { + A.resize(size * size); + B.resize(size * size); + x.resize(size); + y.resize(size); + tmp.resize(size); - init(A.data(), B.data(), x.data(), size); + init(A.data(), B.data(), x.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - x_buffer.initialize(args.device_queue, x.data(), cl::sycl::range<1>(size)); - y_buffer.initialize(args.device_queue, y.data(), cl::sycl::range<1>(size)); - tmp_buffer.initialize(args.device_queue, tmp.data(), cl::sycl::range<1>(size)); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + x_buffer.initialize(args.device_queue, x.data(), sycl::range<1>(size)); + y_buffer.initialize(args.device_queue, y.data(), sycl::range<1>(size)); + tmp_buffer.initialize(args.device_queue, tmp.data(), sycl::range<1>(size)); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - auto x = x_buffer.get_access(cgh); - auto y = y_buffer.get_access(cgh); - auto tmp = tmp_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + auto x = x_buffer.get_access(cgh); + auto y = y_buffer.get_access(cgh); + auto tmp = tmp_buffer.get_access(cgh); - cgh.parallel_for(y.get_range(), [=, N_ = size](item<1> item) { - const auto i = item[0]; + cgh.parallel_for(y.get_range(), [=, N_ = size](item<1> item) { + const auto i = item[0]; - for(size_t j = 0; j < N_; j++) { - tmp[item] += A[{i, j}] * x[j]; - y[item] += B[{i, j}] * x[j]; - } + for(size_t j = 0; j < N_; j++) { + tmp[item] += A[{i, j}] * x[j]; + y[item] += B[{i, j}] * x[j]; + } - y[item] = ALPHA * tmp[item] + BETA * y[item]; - }); - })); - } + y[item] = ALPHA * tmp[item] + BETA * y[item]; + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - // Trigger writeback - y_buffer.reset(); + // Trigger writeback + y_buffer.reset(); - std::vector y_cpu(size); - std::vector tmp_cpu(size); + std::vector y_cpu(size); + std::vector tmp_cpu(size); - gesummv(A.data(), B.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size); + gesummv(A.data(), B.data(), x.data(), y_cpu.data(), tmp_cpu.data(), size); - for(size_t i = 0; i < size; i++) { - const auto diff = percentDiff(y_cpu[i], y[i]); - if(diff > ERROR_THRESHOLD) return false; - } + for(size_t i = 0; i < size; i++) { + const auto diff = percentDiff(y_cpu[i], y[i]); + if(diff > ERROR_THRESHOLD) + return false; + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gesummv"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gesummv"; } private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector B; - std::vector x; - std::vector y; - std::vector tmp; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; - PrefetchedBuffer x_buffer; - PrefetchedBuffer y_buffer; - PrefetchedBuffer tmp_buffer; + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector B; + std::vector x; + std::vector y; + std::vector tmp; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; + PrefetchedBuffer x_buffer; + PrefetchedBuffer y_buffer; + PrefetchedBuffer tmp_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/gramschmidt.cpp b/polybench/gramschmidt.cpp index ba447e17..368b4446 100644 --- a/polybench/gramschmidt.cpp +++ b/polybench/gramschmidt.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -16,148 +16,153 @@ class Gramschmidt2; class Gramschmidt3; void init_array(DATA_TYPE* A, size_t size) { - const auto M = size; - const auto N = size; - - for(size_t i = 0; i < M; i++) { - for(size_t j = 0; j < N; j++) { - A[i * N + j] = ((DATA_TYPE)(i + 1) * (j + 1)) / (M + 1); - } - } + const auto M = size; + const auto N = size; + + for(size_t i = 0; i < M; i++) { + for(size_t j = 0; j < N; j++) { + A[i * N + j] = ((DATA_TYPE)(i + 1) * (j + 1)) / (M + 1); + } + } } void gramschmidt(DATA_TYPE* A, DATA_TYPE* R, DATA_TYPE* Q, size_t size) { - const auto M = size; - const auto N = size; - - for(size_t k = 0; k < N; k++) { - DATA_TYPE nrm = 0; - for(size_t i = 0; i < M; i++) { - nrm += A[i * N + k] * A[i * N + k]; - } - - R[k * N + k] = sqrt(nrm); - for(size_t i = 0; i < M; i++) { - Q[i * N + k] = A[i * N + k] / R[k * N + k]; - } - - for(size_t j = k + 1; j < N; j++) { - R[k * N + j] = 0; - for(size_t i = 0; i < M; i++) { - R[k * N + j] += Q[i * N + k] * A[i * N + j]; - } - for(size_t i = 0; i < M; i++) { - A[i * N + j] = A[i * N + j] - Q[i * N + k] * R[k * N + j]; - } - } - } + const auto M = size; + const auto N = size; + + for(size_t k = 0; k < N; k++) { + DATA_TYPE nrm = 0; + for(size_t i = 0; i < M; i++) { + nrm += A[i * N + k] * A[i * N + k]; + } + + R[k * N + k] = sqrt(nrm); + for(size_t i = 0; i < M; i++) { + Q[i * N + k] = A[i * N + k] / R[k * N + k]; + } + + for(size_t j = k + 1; j < N; j++) { + R[k * N + j] = 0; + for(size_t i = 0; i < M; i++) { + R[k * N + j] += Q[i * N + k] * A[i * N + j]; + } + for(size_t i = 0; i < M; i++) { + A[i * N + j] = A[i * N + j] - Q[i * N + k] * R[k * N + j]; + } + } + } } class Polybench_Gramschmidt { - public: - Polybench_Gramschmidt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - A.resize(size * size); - R.resize(size * size); - Q.resize(size * size); - - init_array(A.data(), size); - - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - R_buffer.initialize(args.device_queue, R.data(), cl::sycl::range<2>(size, size)); - Q_buffer.initialize(args.device_queue, Q.data(), cl::sycl::range<2>(size, size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - for(size_t k = 0; k < size; k++) { - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto R = R_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(1, 1), [=, M_ = size](item<2> item) { - DATA_TYPE nrm = 0; - for(size_t i = 0; i < M_; i++) { - nrm += A[{i, k}] * A[{i, k}]; - } - R[{k, k}] = cl::sycl::sqrt(nrm); - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto R = R_buffer.get_access(cgh); - auto Q = Q_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, 1), id<2>(0, k), [=](item<2> item) { Q[item] = A[item] / R[{k, k}]; }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto R = R_buffer.get_access(cgh); - auto Q = Q_buffer.get_access(cgh); - - cgh.parallel_for(range<2>(size, 1), [=, M_ = size, N_ = size](item<2> item) { - const auto j = item[0]; - - if(j <= k || j >= N_) return; - - R[item] = 0; - for(size_t i = 0; i < M_; i++) { - R[item] += Q[{i, k}] * A[{i, j}]; - } - - for(size_t i = 0; i < M_; i++) { - A[{i, j}] -= Q[{i, k}] * R[item]; - } - }); - })); - } - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - std::vector A_cpu(size * size); - std::vector R_cpu(size * size); - std::vector Q_cpu(size * size); - - // Trigger writeback - A_buffer.reset(); - - init_array(A_cpu.data(), size); - - gramschmidt(A_cpu.data(), R_cpu.data(), Q_cpu.data(), size); - - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(A_cpu[i * size + j], A[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gramschmidt"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector A; - std::vector R; - std::vector Q; - - PrefetchedBuffer A_buffer; - PrefetchedBuffer R_buffer; - PrefetchedBuffer Q_buffer; +public: + Polybench_Gramschmidt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + A.resize(size * size); + R.resize(size * size); + Q.resize(size * size); + + init_array(A.data(), size); + + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + R_buffer.initialize(args.device_queue, R.data(), sycl::range<2>(size, size)); + Q_buffer.initialize(args.device_queue, Q.data(), sycl::range<2>(size, size)); + } + + void run(std::vector& events) { + using namespace sycl; + + for(size_t k = 0; k < size; k++) { + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto R = R_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(1, 1), [=, M_ = size](item<2> item) { + DATA_TYPE nrm = 0; + for(size_t i = 0; i < M_; i++) { + nrm += A[{i, k}] * A[{i, k}]; + } + R[{k, k}] = sycl::sqrt(nrm); + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto R = R_buffer.get_access(cgh); + auto Q = Q_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, 1), [=](item<2> gid) { + const id<2> offset(0, k); + Q[gid + offset] = A[gid + offset] / R[{k, k}]; + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto R = R_buffer.get_access(cgh); + auto Q = Q_buffer.get_access(cgh); + + cgh.parallel_for(range<2>(size, 1), [=, M_ = size, N_ = size](item<2> item) { + const auto j = item[0]; + + if(j <= k || j >= N_) + return; + + R[item] = 0; + for(size_t i = 0; i < M_; i++) { + R[item] += Q[{i, k}] * A[{i, j}]; + } + + for(size_t i = 0; i < M_; i++) { + A[{i, j}] -= Q[{i, k}] * R[item]; + } + }); + })); + } + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + std::vector A_cpu(size * size); + std::vector R_cpu(size * size); + std::vector Q_cpu(size * size); + + // Trigger writeback + A_buffer.reset(); + + init_array(A_cpu.data(), size); + + gramschmidt(A_cpu.data(), R_cpu.data(), Q_cpu.data(), size); + + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(A_cpu[i * size + j], A[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Gramschmidt"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector A; + std::vector R; + std::vector Q; + + PrefetchedBuffer A_buffer; + PrefetchedBuffer R_buffer; + PrefetchedBuffer Q_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; -} + BenchmarkApp app(argc, argv); + app.run(); + return 0; +} \ No newline at end of file diff --git a/polybench/mvt.cpp b/polybench/mvt.cpp index 497d14c4..b3077062 100644 --- a/polybench/mvt.cpp +++ b/polybench/mvt.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -14,134 +14,136 @@ class Mvt1; class Mvt2; void init_arrays(DATA_TYPE* a, DATA_TYPE* x1, DATA_TYPE* x2, DATA_TYPE* y_1, DATA_TYPE* y_2, size_t size) { - const auto N = size; - - for(size_t i = 0; i < N; i++) { - x1[i] = 0.0; - x2[i] = 0.0; - y_1[i] = 1.0; - y_2[i] = 1.0; - - for(size_t j = 0; j < N; j++) { - a[i * N + j] = (DATA_TYPE)(i + j + 1.0) / N; - } - } + const auto N = size; + + for(size_t i = 0; i < N; i++) { + x1[i] = 0.0; + x2[i] = 0.0; + y_1[i] = 1.0; + y_2[i] = 1.0; + + for(size_t j = 0; j < N; j++) { + a[i * N + j] = (DATA_TYPE)(i + j + 1.0) / N; + } + } } void runMvt(DATA_TYPE* a, DATA_TYPE* x1, DATA_TYPE* x2, DATA_TYPE* y1, DATA_TYPE* y2, size_t size) { - const auto N = size; - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - x1[i] = x1[i] + a[i * N + j] * y1[j]; - } - } - - for(size_t k = 0; k < N; k++) { - for(size_t l = 0; l < N; l++) { - x2[k] = x2[k] + a[k * N + l] * y2[l]; - } - } + const auto N = size; + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + x1[i] = x1[i] + a[i * N + j] * y1[j]; + } + } + + for(size_t k = 0; k < N; k++) { + for(size_t l = 0; l < N; l++) { + x2[k] = x2[k] + a[k * N + l] * y2[l]; + } + } } class Polybench_Mvt { - public: - Polybench_Mvt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - - void setup() { - a.resize(size * size); - x1.resize(size); - x2.resize(size); - y1.resize(size); - y2.resize(size); - - init_arrays(a.data(), x1.data(), x2.data(), y1.data(), y2.data(), size); - - a_buffer .initialize(args.device_queue, a.data(), cl::sycl::range<2>(size, size)); - x1_buffer.initialize(args.device_queue, x1.data(), cl::sycl::range<1>(size)); - x2_buffer.initialize(args.device_queue, x2.data(), cl::sycl::range<1>(size)); - y1_buffer.initialize(args.device_queue, y1.data(), cl::sycl::range<1>(size)); - y2_buffer.initialize(args.device_queue, y2.data(), cl::sycl::range<1>(size)); - } - - void run(std::vector& events) { - using namespace cl::sycl; - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto a = a_buffer.get_access(cgh); - auto y1 = y1_buffer.get_access(cgh); - auto x1 = x1_buffer.get_access(cgh); - - cgh.parallel_for(x1_buffer.get_range(), [=, N_ = size](item<1> item) { - const auto i = item[0]; - - for(size_t j = 0; j < N_; j++) { - x1[i] += a[{i, j}] * y1[j]; - } - }); - })); - - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto a = a_buffer.get_access(cgh); - auto y2 = y2_buffer.get_access(cgh); - auto x2 = x2_buffer.get_access(cgh); - - cgh.parallel_for(x1_buffer.get_range(), [=, N_ = size](item<1> item) { - const auto k = item[0]; - - for(size_t l = 0; l < N_; l++) { - x2[k] += a[{k, l}] * y2[l]; - } - }); - })); - } - - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; - - std::vector x1_cpu(size); - std::vector x2_cpu(size); - - // Trigger writeback - x1_buffer.reset(); - x2_buffer.reset(); - - init_arrays(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); - - runMvt(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); - - for(size_t i = 0; i < size; i++) { - auto diff = percentDiff(x1_cpu[i], x1[i]); - if(diff > ERROR_THRESHOLD) return false; - - diff = percentDiff(x2_cpu[i], x2[i]); - if(diff > ERROR_THRESHOLD) return false; - } - - return true; - } - - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Mvt"; } - - private: - BenchmarkArgs args; - - const size_t size; - std::vector a; - std::vector x1; - std::vector x2; - std::vector y1; - std::vector y2; - - PrefetchedBuffer a_buffer; - PrefetchedBuffer x1_buffer; - PrefetchedBuffer x2_buffer; - PrefetchedBuffer y1_buffer; - PrefetchedBuffer y2_buffer; +public: + Polybench_Mvt(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} + + void setup() { + a.resize(size * size); + x1.resize(size); + x2.resize(size); + y1.resize(size); + y2.resize(size); + + init_arrays(a.data(), x1.data(), x2.data(), y1.data(), y2.data(), size); + + a_buffer.initialize(args.device_queue, a.data(), sycl::range<2>(size, size)); + x1_buffer.initialize(args.device_queue, x1.data(), sycl::range<1>(size)); + x2_buffer.initialize(args.device_queue, x2.data(), sycl::range<1>(size)); + y1_buffer.initialize(args.device_queue, y1.data(), sycl::range<1>(size)); + y2_buffer.initialize(args.device_queue, y2.data(), sycl::range<1>(size)); + } + + void run(std::vector& events) { + using namespace sycl; + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto a = a_buffer.get_access(cgh); + auto y1 = y1_buffer.get_access(cgh); + auto x1 = x1_buffer.get_access(cgh); + + cgh.parallel_for(x1_buffer.get_range(), [=, N_ = size](item<1> item) { + const auto i = item[0]; + + for(size_t j = 0; j < N_; j++) { + x1[i] += a[{i, j}] * y1[j]; + } + }); + })); + + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto a = a_buffer.get_access(cgh); + auto y2 = y2_buffer.get_access(cgh); + auto x2 = x2_buffer.get_access(cgh); + + cgh.parallel_for(x1_buffer.get_range(), [=, N_ = size](item<1> item) { + const auto k = item[0]; + + for(size_t l = 0; l < N_; l++) { + x2[k] += a[{k, l}] * y2[l]; + } + }); + })); + } + + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; + + std::vector x1_cpu(size); + std::vector x2_cpu(size); + + // Trigger writeback + x1_buffer.reset(); + x2_buffer.reset(); + + init_arrays(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); + + runMvt(a.data(), x1_cpu.data(), x2_cpu.data(), y1.data(), y2.data(), size); + + for(size_t i = 0; i < size; i++) { + auto diff = percentDiff(x1_cpu[i], x1[i]); + if(diff > ERROR_THRESHOLD) + return false; + + diff = percentDiff(x2_cpu[i], x2[i]); + if(diff > ERROR_THRESHOLD) + return false; + } + + return true; + } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Mvt"; } + +private: + BenchmarkArgs args; + + const size_t size; + std::vector a; + std::vector x1; + std::vector x2; + std::vector y1; + std::vector y2; + + PrefetchedBuffer a_buffer; + PrefetchedBuffer x1_buffer; + PrefetchedBuffer x2_buffer; + PrefetchedBuffer y1_buffer; + PrefetchedBuffer y2_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/syr2k.cpp b/polybench/syr2k.cpp index d62fb7aa..63e231c6 100644 --- a/polybench/syr2k.cpp +++ b/polybench/syr2k.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -16,117 +16,118 @@ constexpr DATA_TYPE ALPHA = 1; constexpr DATA_TYPE BETA = 1; void init_arrays(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) { - const auto N = size; - const auto M = size; - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - C[i * N + j] = ((DATA_TYPE)i * j + 2) / N; - } - - for(size_t j = 0; j < M; j++) { - A[i * N + j] = ((DATA_TYPE)i * j) / N; - B[i * N + j] = ((DATA_TYPE)i * j + 1) / N; - } - } + const auto N = size; + const auto M = size; + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + C[i * N + j] = ((DATA_TYPE)i * j + 2) / N; + } + + for(size_t j = 0; j < M; j++) { + A[i * N + j] = ((DATA_TYPE)i * j) / N; + B[i * N + j] = ((DATA_TYPE)i * j + 1) / N; + } + } } void syr2k(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, size_t size) { - const auto N = size; - const auto M = size; - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - C[i * N + j] *= BETA; - } - } - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - for(size_t k = 0; k < M; k++) { - C[i * N + j] += ALPHA * A[i * M + k] * B[j * M + k]; - C[i * N + j] += ALPHA * B[i * M + k] * A[j * M + k]; - } - } - } + const auto N = size; + const auto M = size; + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + C[i * N + j] *= BETA; + } + } + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + for(size_t k = 0; k < M; k++) { + C[i * N + j] += ALPHA * A[i * M + k] * B[j * M + k]; + C[i * N + j] += ALPHA * B[i * M + k] * A[j * M + k]; + } + } + } } class Polybench_Syr2k { - public: - Polybench_Syr2k(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} +public: + Polybench_Syr2k(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - B.resize(size * size); - C.resize(size * size); + void setup() { + A.resize(size * size); + B.resize(size * size); + C.resize(size * size); - init_arrays(A.data(), B.data(), C.data(), size); + init_arrays(A.data(), B.data(), C.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - B_buffer.initialize(args.device_queue, B.data(), cl::sycl::range<2>(size, size)); - C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size)); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + B_buffer.initialize(args.device_queue, B.data(), sycl::range<2>(size, size)); + C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size)); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto B = B_buffer.get_access(cgh); - auto C = C_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto B = B_buffer.get_access(cgh); + auto C = C_buffer.get_access(cgh); - cgh.parallel_for(C_buffer.get_range(), [=, M_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; + cgh.parallel_for(C_buffer.get_range(), [=, M_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; - C[item] *= BETA; + C[item] *= BETA; - for(size_t k = 0; k < M_; k++) { - C[item] += ALPHA * A[{i, k}] * B[{j, k}] + ALPHA * B[{i, k}] * A[{j, k}]; - } - }); - })); - } + for(size_t k = 0; k < M_; k++) { + C[item] += ALPHA * A[{i, k}] * B[{j, k}] + ALPHA * B[{i, k}] * A[{j, k}]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - std::vector C_cpu(size * size); + std::vector C_cpu(size * size); - init_arrays(A.data(), B.data(), C_cpu.data(), size); + init_arrays(A.data(), B.data(), C_cpu.data(), size); - // Trigger writeback - C_buffer.reset(); + // Trigger writeback + C_buffer.reset(); - syr2k(A.data(), B.data(), C_cpu.data(), size); + syr2k(A.data(), B.data(), C_cpu.data(), size); - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syr2k"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syr2k"; } - private: - BenchmarkArgs args; +private: + BenchmarkArgs args; - const size_t size; - std::vector A; - std::vector B; - std::vector C; + const size_t size; + std::vector A; + std::vector B; + std::vector C; - PrefetchedBuffer A_buffer; - PrefetchedBuffer B_buffer; - PrefetchedBuffer C_buffer; + PrefetchedBuffer A_buffer; + PrefetchedBuffer B_buffer; + PrefetchedBuffer C_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/polybench/syrk.cpp b/polybench/syrk.cpp index a5b9ae6a..1d017f7c 100644 --- a/polybench/syrk.cpp +++ b/polybench/syrk.cpp @@ -3,7 +3,7 @@ #include -#include +#include #include "common.h" #include "polybenchUtilFuncts.h" @@ -16,111 +16,112 @@ constexpr DATA_TYPE alpha = 123; constexpr DATA_TYPE beta = 14512; void init_arrays(DATA_TYPE* A, DATA_TYPE* C, size_t size) { - const auto N = size; - const auto M = size; - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < M; j++) { - A[i * M + j] = ((DATA_TYPE)i * j) / N; - } - - for(size_t j = 0; j < N; j++) { - C[i * M + j] = ((DATA_TYPE)i * j + 2) / N; - } - } + const auto N = size; + const auto M = size; + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < M; j++) { + A[i * M + j] = ((DATA_TYPE)i * j) / N; + } + + for(size_t j = 0; j < N; j++) { + C[i * M + j] = ((DATA_TYPE)i * j + 2) / N; + } + } } void syrk(DATA_TYPE* A, DATA_TYPE* C, size_t size) { - const auto N = size; - const auto M = size; - - /* C := alpha*A*A' + beta*C */ - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - C[i * M + j] *= beta; - } - } - - for(size_t i = 0; i < N; i++) { - for(size_t j = 0; j < N; j++) { - for(size_t k = 0; k < M; k++) { - C[i * N + j] += alpha * A[i * M + k] * A[j * M + k]; - } - } - } + const auto N = size; + const auto M = size; + + /* C := alpha*A*A' + beta*C */ + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + C[i * M + j] *= beta; + } + } + + for(size_t i = 0; i < N; i++) { + for(size_t j = 0; j < N; j++) { + for(size_t k = 0; k < M; k++) { + C[i * N + j] += alpha * A[i * M + k] * A[j * M + k]; + } + } + } } class Polybench_Syrk { - public: - Polybench_Syrk(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} +public: + Polybench_Syrk(const BenchmarkArgs& args) : args(args), size(args.problem_size) {} - void setup() { - A.resize(size * size); - C.resize(size * size); + void setup() { + A.resize(size * size); + C.resize(size * size); - init_arrays(A.data(), C.data(), size); + init_arrays(A.data(), C.data(), size); - A_buffer.initialize(args.device_queue, A.data(), cl::sycl::range<2>(size, size)); - C_buffer.initialize(args.device_queue, C.data(), cl::sycl::range<2>(size, size)); - } + A_buffer.initialize(args.device_queue, A.data(), sycl::range<2>(size, size)); + C_buffer.initialize(args.device_queue, C.data(), sycl::range<2>(size, size)); + } - void run(std::vector& events) { - using namespace cl::sycl; + void run(std::vector& events) { + using namespace sycl; - events.push_back(args.device_queue.submit([&](handler& cgh) { - auto A = A_buffer.get_access(cgh); - auto C = C_buffer.get_access(cgh); + events.push_back(args.device_queue.submit([&](handler& cgh) { + auto A = A_buffer.get_access(cgh); + auto C = C_buffer.get_access(cgh); - cgh.parallel_for(C_buffer.get_range(), [=, M_ = size](item<2> item) { - const auto i = item[0]; - const auto j = item[1]; + cgh.parallel_for(C_buffer.get_range(), [=, M_ = size](item<2> item) { + const auto i = item[0]; + const auto j = item[1]; - C[item] *= beta; + C[item] *= beta; - for(size_t k = 0; k < M_; k++) { - C[item] += alpha * A[{i, k}] * A[{j, k}]; - } - }); - })); - } + for(size_t k = 0; k < M_; k++) { + C[item] += alpha * A[{i, k}] * A[{j, k}]; + } + }); + })); + } - bool verify(VerificationSetting&) { - constexpr auto ERROR_THRESHOLD = 0.05; + bool verify(VerificationSetting&) { + constexpr auto ERROR_THRESHOLD = 0.05; - // Trigger writeback - C_buffer.reset(); + // Trigger writeback + C_buffer.reset(); - std::vector C_cpu(size * size); + std::vector C_cpu(size * size); - init_arrays(A.data(), C_cpu.data(), size); + init_arrays(A.data(), C_cpu.data(), size); - syrk(A.data(), C_cpu.data(), size); + syrk(A.data(), C_cpu.data(), size); - for(size_t i = 0; i < size; i++) { - for(size_t j = 0; j < size; j++) { - const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); - if(diff > ERROR_THRESHOLD) return false; - } - } + for(size_t i = 0; i < size; i++) { + for(size_t j = 0; j < size; j++) { + const auto diff = percentDiff(C_cpu[i * size + j], C[i * size + j]); + if(diff > ERROR_THRESHOLD) + return false; + } + } - return true; - } + return true; + } - static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syrk"; } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Polybench_Syrk"; } - private: - BenchmarkArgs args; +private: + BenchmarkArgs args; - const size_t size; - std::vector A; - std::vector C; + const size_t size; + std::vector A; + std::vector C; - PrefetchedBuffer A_buffer; - PrefetchedBuffer C_buffer; + PrefetchedBuffer A_buffer; + PrefetchedBuffer C_buffer; }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - app.run(); - return 0; + BenchmarkApp app(argc, argv); + app.run(); + return 0; } diff --git a/runtime/blocked_transform.cpp b/runtime/blocked_transform.cpp index 10109b71..77c9e7d0 100644 --- a/runtime/blocked_transform.cpp +++ b/runtime/blocked_transform.cpp @@ -1,39 +1,32 @@ - - - - #include "common.h" +#include +#include #include #include -#include -#include -using namespace cl; -using complex = sycl::vec; +using complex = sycl::vec; -inline complex mandelbrot_iteration(complex z, complex c) -{ +inline complex mandelbrot_iteration(complex z, complex c) { complex result = c; - result.x() += z.x()*z.x() - z.y()*z.y(); - result.y() += 2 * z.x()*z.y(); + result.x() += z.x() * z.x() - z.y() * z.y(); + result.y() += 2 * z.x() * z.y(); return result; } -template -complex mandelbrot_sequence(complex z0, complex c) -{ +template +complex mandelbrot_sequence(complex z0, complex c) { complex z = z0; - for(int i = 0; i < Num_iterations; ++i){ + for(int i = 0; i < Num_iterations; ++i) { z = mandelbrot_iteration(z, c); } return z; } -template +template class MandelbrotKernel; /// Performs a blocked transform operation using the mandelbrot sequence @@ -47,57 +40,47 @@ class MandelbrotKernel; /// accessed ranges are non-overlapping. In order for the benchmark to stress /// these aspects, \c Num_iterations should be tuned such that the kernel /// runtime is similar to the data transfer time of one block. -template -class BlockedTransform -{ +template +class BlockedTransform { private: - std::vector data; - BenchmarkArgs args; - std::size_t block_size; + std::vector data; + BenchmarkArgs args; + std::size_t block_size; + public: - BlockedTransform( - const BenchmarkArgs &_args, - std::size_t _block_size) - : args(_args), block_size{_block_size} - { - assert(block_size > 0); - } - - void setup() { - init_data(data); + BlockedTransform(const BenchmarkArgs& _args, std::size_t _block_size) : args(_args), block_size{_block_size} { + assert(block_size > 0); } - void run(){ - sycl::buffer buff {data.data(), sycl::range<1>{data.size()}}; + void setup() { init_data(data); } - sycl::id<1> begin {0}; - sycl::range<1> current_batch_size {block_size}; - for(;begin[0] < data.size(); begin[0] += this->block_size) { + void run() { + sycl::buffer buff{data.data(), sycl::range<1>{data.size()}}; - current_batch_size[0] = std::min(this->block_size, data.size()-begin[0]); + sycl::id<1> begin{0}; + sycl::range<1> current_batch_size{block_size}; + for(; begin[0] < data.size(); begin[0] += this->block_size) { + current_batch_size[0] = std::min(this->block_size, data.size() - begin[0]); - args.device_queue.submit([&](sycl::handler &cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { + auto acc = buff.get_access(cgh, current_batch_size, begin); - auto acc = buff.get_access( - cgh, current_batch_size, begin); - - cgh.parallel_for>( - current_batch_size, begin, [=](cl::sycl::id<1> idx) { - const complex z0{0.0f, 0.0f}; - acc[idx] = mandelbrot_sequence(z0, acc[idx]); - }); + cgh.parallel_for>(current_batch_size, [=](sycl::id<1> idx) { + const complex z0{0.0f, 0.0f}; + acc[idx] = mandelbrot_sequence(z0, acc[idx]); + }); }); } } - bool verify(VerificationSetting &ver) { - std::vector v; + bool verify(VerificationSetting& ver) { + std::vector v; init_data(v); const double tol = 1.e-5; for(std::size_t i = 0; i < v.size(); ++i) { - v[i] = mandelbrot_sequence(complex{0.0f,0.0f}, v[i]); + v[i] = mandelbrot_sequence(complex{0.0f, 0.0f}, v[i]); if(std::abs(v[i].x() - data[i].x()) > tol) return false; @@ -107,7 +90,7 @@ class BlockedTransform return true; } - + std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; name << "Runtime_BlockedTransform_iter_"; @@ -117,25 +100,21 @@ class BlockedTransform } private: - void init_data(std::vector& initial_data) - { + void init_data(std::vector& initial_data) { initial_data.clear(); initial_data.resize(args.problem_size); - for(std::size_t i = 0; i < initial_data.size(); ++i) - { - initial_data[i].x() = 0.8*std::cos(i/args.problem_size); - initial_data[i].y() = 0.8*std::sin(i/args.problem_size); + for(std::size_t i = 0; i < initial_data.size(); ++i) { + initial_data[i].x() = 0.8 * std::cos(i / args.problem_size); + initial_data[i].y() = 0.8 * std::sin(i / args.problem_size); } } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - for (std::size_t block_size = app.getArgs().local_size; - block_size < app.getArgs().problem_size; block_size *= 2) { + for(std::size_t block_size = app.getArgs().local_size; block_size < app.getArgs().problem_size; block_size *= 2) { app.run>(block_size); app.run>(block_size); app.run>(block_size); @@ -144,7 +123,3 @@ int main(int argc, char** argv) return 0; } - - - - diff --git a/runtime/dag_task_throughput_independent.cpp b/runtime/dag_task_throughput_independent.cpp index bd848341..dc136dc7 100644 --- a/runtime/dag_task_throughput_independent.cpp +++ b/runtime/dag_task_throughput_independent.cpp @@ -2,7 +2,7 @@ #include -using namespace cl; +using namespace sycl; class IndependentDagTaskThroughputKernelSingleTask; class IndependentDagTaskThroughputKernelBasicPF; @@ -10,106 +10,82 @@ class DagTaskThroughputKernelNdrangePF; class DagTaskThroughputKernelHierarchicalPF; // Measures the time it takes to run trivial single_task and parallel_for kernels -// that are *independent*. +// that are *independent*. // This benchmark can be used to see how well a SYCL implementation // can utilize hardware concurrency. -class IndependentDagTaskThroughput -{ +class IndependentDagTaskThroughput { std::vector> dummy_buffers; BenchmarkArgs args; + public: - IndependentDagTaskThroughput(const BenchmarkArgs &_args) - : args(_args) - {} - - void setup() - { - for (std::size_t i = 0; i < args.problem_size; ++i) { + IndependentDagTaskThroughput(const BenchmarkArgs& _args) : args(_args) {} + + void setup() { + for(std::size_t i = 0; i < args.problem_size; ++i) { dummy_buffers.push_back(sycl::buffer{sycl::range<1>{1}}); forceDataAllocation(args.device_queue, dummy_buffers.back()); } } - void submit_single_task() - { + void submit_single_task() { for(std::size_t i = 0; i < args.problem_size; ++i) { - - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_buffers[i].get_access(cgh); - - cgh.single_task( - [=]() - { - acc[0] = i; - }); + + cgh.single_task([=]() { acc[0] = i; }); }); // submit } } - void submit_basic_parallel_for() - { + void submit_basic_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_buffers[i].get_access(cgh); - + cgh.parallel_for( - - sycl::range<1>{args.local_size}, - [=](sycl::id<1> idx) - { - if(idx[0] == 0) - acc[0] = i; - }); + + sycl::range<1>{args.local_size}, [=](sycl::id<1> idx) { + if(idx[0] == 0) + acc[0] = i; + }); }); // submit } } - void submit_ndrange_parallel_for() - { + void submit_ndrange_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_buffers[i].get_access(cgh); - + cgh.parallel_for( - sycl::nd_range<1>{ - sycl::range<1>{args.local_size}, - sycl::range<1>{args.local_size}}, - [=](sycl::nd_item<1> idx) - { - if(idx.get_global_id(0) == 0) - acc[0] = i; - }); + sycl::nd_range<1>{sycl::range<1>{args.local_size}, sycl::range<1>{args.local_size}}, + [=](sycl::nd_item<1> idx) { + if(idx.get_global_id(0) == 0) + acc[0] = i; + }); }); // submit } } - void submit_hierarchical_parallel_for() - { + void submit_hierarchical_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_buffers[i].get_access(cgh); - + cgh.parallel_for_work_group( - sycl::range<1>{1}, sycl::range<1>{args.local_size}, - [=](sycl::group<1> grp) - { - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - if(idx.get_global_id(0) == 0) - acc[0] = i; - }); - }); + sycl::range<1>{1}, sycl::range<1>{args.local_size}, [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + if(idx.get_global_id(0) == 0) + acc[0] = i; + }); + }); }); // submit } } - bool verify(VerificationSetting &ver) { - for(std::size_t i = 0; i < dummy_buffers.size(); ++i){ - auto host_acc = - dummy_buffers[i].get_access(); + bool verify(VerificationSetting& ver) { + for(std::size_t i = 0; i < dummy_buffers.size(); ++i) { + auto host_acc = dummy_buffers[i].get_host_access(); if(host_acc[0] != i) return false; @@ -119,72 +95,49 @@ class IndependentDagTaskThroughput } }; -class IndependentDagTaskThroughputSingleTask - : public IndependentDagTaskThroughput -{ +class IndependentDagTaskThroughputSingleTask : public IndependentDagTaskThroughput { public: - IndependentDagTaskThroughputSingleTask(const BenchmarkArgs& args) - : IndependentDagTaskThroughput{args} {} + IndependentDagTaskThroughputSingleTask(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {} - void run(){ - submit_single_task(); - } + void run() { submit_single_task(); } - static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Runtime_IndependentDAGTaskThroughput_SingleTask"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_IndependentDAGTaskThroughput_SingleTask"; } }; -class IndependentDagTaskThroughputBasicPF - : public IndependentDagTaskThroughput -{ +class IndependentDagTaskThroughputBasicPF : public IndependentDagTaskThroughput { public: - IndependentDagTaskThroughputBasicPF(const BenchmarkArgs& args) - : IndependentDagTaskThroughput{args} {} + IndependentDagTaskThroughputBasicPF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {} - void run(){ - submit_basic_parallel_for(); - } + void run() { submit_basic_parallel_for(); } static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_IndependentDAGTaskThroughput_BasicParallelFor"; } }; -class IndependentDagTaskThroughputNDRangePF - : public IndependentDagTaskThroughput -{ +class IndependentDagTaskThroughputNDRangePF : public IndependentDagTaskThroughput { public: - IndependentDagTaskThroughputNDRangePF(const BenchmarkArgs& args) - : IndependentDagTaskThroughput{args} {} + IndependentDagTaskThroughputNDRangePF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {} - void run(){ - submit_ndrange_parallel_for(); - } + void run() { submit_ndrange_parallel_for(); } static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor"; } }; -class IndependentDagTaskThroughputHierarchicalPF - : public IndependentDagTaskThroughput -{ +class IndependentDagTaskThroughputHierarchicalPF : public IndependentDagTaskThroughput { public: - IndependentDagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) - : IndependentDagTaskThroughput{args} {} + IndependentDagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) : IndependentDagTaskThroughput{args} {} - void run(){ - submit_hierarchical_parallel_for(); - } + void run() { submit_hierarchical_parallel_for(); } static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_IndependentDAGTaskThroughput_HierarchicalParallelFor"; } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run(); @@ -194,6 +147,6 @@ int main(int argc, char** argv) // or triSYCL, this will be prohibitively slow if(app.shouldRunNDRangeKernels()) app.run(); - + return 0; } diff --git a/runtime/dag_task_throughput_sequential.cpp b/runtime/dag_task_throughput_sequential.cpp index 2f9eee54..f00032e1 100644 --- a/runtime/dag_task_throughput_sequential.cpp +++ b/runtime/dag_task_throughput_sequential.cpp @@ -1,6 +1,6 @@ #include "common.h" -using namespace cl; +using namespace sycl; class DagTaskThroughputKernelSingleTask; class DagTaskThroughputKernelBasicPF; @@ -10,166 +10,121 @@ class DagTaskThroughputKernelHierarchicalPF; // Measures the time it takes to run trivial single_task and parallel_for kernels // that depend on each other, and have to be executed in-order (-> Utilization of // parallel hardware is *not* tested) -// This is influenced by +// This is influenced by // * latencies in task submission to the backend, e.g. GPU kernel latencies // * scheduling latencies caused by the SYCL implementation // * other overheads -class DagTaskThroughput -{ +class DagTaskThroughput { const int initial_value; PrefetchedBuffer dummy_counter; BenchmarkArgs args; + public: - DagTaskThroughput(const BenchmarkArgs &_args) - : initial_value{0}, args(_args) - {} + DagTaskThroughput(const BenchmarkArgs& _args) : initial_value{0}, args(_args) {} void setup() { dummy_counter.initialize(args.device_queue, &initial_value, sycl::range<1>{1}); } - void submit_single_task() - { + void submit_single_task() { // Behold! The weirdest, most inefficient summation algorithm ever conceived! for(std::size_t i = 0; i < args.problem_size; ++i) { - - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_counter.get_access(cgh); - - cgh.single_task( - [=]() - { - acc[0] += 1; - }); + + cgh.single_task([=]() { acc[0] += 1; }); }); // submit } } - void submit_basic_parallel_for() - { + void submit_basic_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_counter.get_access(cgh); - + cgh.parallel_for( - // while we cannot control it, let's hope the SYCL implementation - // spawns a single work group. - sycl::range<1>{args.local_size}, - [=](sycl::id<1> idx) - { - if(idx[0] == 0) - acc[0] += 1; - }); + // while we cannot control it, let's hope the SYCL implementation + // spawns a single work group. + sycl::range<1>{args.local_size}, [=](sycl::id<1> idx) { + if(idx[0] == 0) + acc[0] += 1; + }); }); // submit } } - void submit_ndrange_parallel_for() - { + void submit_ndrange_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_counter.get_access(cgh); - + cgh.parallel_for( - sycl::nd_range<1>{ - sycl::range<1>{args.local_size}, - sycl::range<1>{args.local_size}}, - [=](sycl::nd_item<1> idx) - { - if(idx.get_global_id(0) == 0) - acc[0] += 1; - }); + sycl::nd_range<1>{sycl::range<1>{args.local_size}, sycl::range<1>{args.local_size}}, + [=](sycl::nd_item<1> idx) { + if(idx.get_global_id(0) == 0) + acc[0] += 1; + }); }); // submit } } - void submit_hierarchical_parallel_for() - { + void submit_hierarchical_parallel_for() { for(std::size_t i = 0; i < args.problem_size; ++i) { - args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + args.device_queue.submit([&](sycl::handler& cgh) { auto acc = dummy_counter.get_access(cgh); - + cgh.parallel_for_work_group( - sycl::range<1>{1}, sycl::range<1>{args.local_size}, - [=](sycl::group<1> grp) - { - grp.parallel_for_work_item([&](sycl::h_item<1> idx){ - if(idx.get_global_id(0) == 0) - acc[0] += 1; - }); - }); + sycl::range<1>{1}, sycl::range<1>{args.local_size}, [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + if(idx.get_global_id(0) == 0) + acc[0] += 1; + }); + }); }); // submit } } - bool verify(VerificationSetting &ver) { - auto host_acc = - dummy_counter.get_access(); + bool verify(VerificationSetting& ver) { + auto host_acc = dummy_counter.get_host_access(); return host_acc[0] == args.problem_size; } }; -class DagTaskThroughputSingleTask : public DagTaskThroughput -{ +class DagTaskThroughputSingleTask : public DagTaskThroughput { public: - DagTaskThroughputSingleTask(const BenchmarkArgs& args) - : DagTaskThroughput{args} {} + DagTaskThroughputSingleTask(const BenchmarkArgs& args) : DagTaskThroughput{args} {} - void run(){ - submit_single_task(); - } + void run() { submit_single_task(); } - static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Runtime_DAGTaskThroughput_SingleTask"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_SingleTask"; } }; -class DagTaskThroughputBasicPF : public DagTaskThroughput -{ +class DagTaskThroughputBasicPF : public DagTaskThroughput { public: - DagTaskThroughputBasicPF(const BenchmarkArgs& args) - : DagTaskThroughput{args} {} + DagTaskThroughputBasicPF(const BenchmarkArgs& args) : DagTaskThroughput{args} {} - void run(){ - submit_basic_parallel_for(); - } + void run() { submit_basic_parallel_for(); } - static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Runtime_DAGTaskThroughput_BasicParallelFor"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_BasicParallelFor"; } }; -class DagTaskThroughputNDRangePF : public DagTaskThroughput -{ +class DagTaskThroughputNDRangePF : public DagTaskThroughput { public: - DagTaskThroughputNDRangePF(const BenchmarkArgs& args) - : DagTaskThroughput{args} {} + DagTaskThroughputNDRangePF(const BenchmarkArgs& args) : DagTaskThroughput{args} {} - void run(){ - submit_ndrange_parallel_for(); - } + void run() { submit_ndrange_parallel_for(); } - static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Runtime_DAGTaskThroughput_NDRangeParallelFor"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_NDRangeParallelFor"; } }; -class DagTaskThroughputHierarchicalPF : public DagTaskThroughput -{ +class DagTaskThroughputHierarchicalPF : public DagTaskThroughput { public: - DagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) - : DagTaskThroughput{args} {} + DagTaskThroughputHierarchicalPF(const BenchmarkArgs& args) : DagTaskThroughput{args} {} - void run(){ - submit_hierarchical_parallel_for(); - } + void run() { submit_hierarchical_parallel_for(); } static std::string getBenchmarkName(BenchmarkArgs& args) { return "Runtime_DAGTaskThroughput_HierarchicalParallelFor"; @@ -177,8 +132,7 @@ class DagTaskThroughputHierarchicalPF : public DagTaskThroughput }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run(); diff --git a/runtime/matmulchain.cpp b/runtime/matmulchain.cpp index e8b89f33..44635921 100644 --- a/runtime/matmulchain.cpp +++ b/runtime/matmulchain.cpp @@ -10,38 +10,38 @@ template class MatmulChain; template -void multiply(cl::sycl::queue& queue, cl::sycl::buffer& mat_a, cl::sycl::buffer& mat_b, - cl::sycl::buffer& mat_c, const size_t mat_size) { - queue.submit([&](cl::sycl::handler& cgh) { - auto a = mat_a.template get_access(cgh); - auto b = mat_b.template get_access(cgh); - auto c = mat_c.template get_access(cgh); - - cgh.parallel_for>(cl::sycl::range<2>(mat_size, mat_size), [=](cl::sycl::item<2> item) { - auto sum = 0; - for(size_t k = 0; k < mat_size; ++k) { - const auto a_ik = a[{item[0], k}]; - const auto b_kj = b[{k, item[1]}]; - sum += a_ik * b_kj; - } - c[item] = sum; - }); +void multiply(sycl::queue& queue, sycl::buffer& mat_a, sycl::buffer& mat_b, sycl::buffer& mat_c, + const size_t mat_size) { + queue.submit([&](sycl::handler& cgh) { + auto a = mat_a.template get_access(cgh); + auto b = mat_b.template get_access(cgh); + auto c = mat_c.template get_access(cgh); + + cgh.parallel_for>(sycl::range<2>(mat_size, mat_size), [=](sycl::item<2> item) { + auto sum = 0; + for(size_t k = 0; k < mat_size; ++k) { + const auto a_ik = a[{item[0], k}]; + const auto b_kj = b[{k, item[1]}]; + sum += a_ik * b_kj; + } + c[item] = sum; + }); }); } template class MatmulChain { -protected: - std::vector mat_a; - std::vector mat_b; - std::vector mat_c; - std::vector mat_d; - std::vector mat_res; - BenchmarkArgs args; - int mat_size; - - PrefetchedBuffer mat_a_buf; +protected: + std::vector mat_a; + std::vector mat_b; + std::vector mat_c; + std::vector mat_d; + std::vector mat_res; + BenchmarkArgs args; + int mat_size; + + PrefetchedBuffer mat_a_buf; PrefetchedBuffer mat_b_buf; PrefetchedBuffer mat_c_buf; PrefetchedBuffer mat_d_buf; @@ -50,68 +50,68 @@ class MatmulChain { PrefetchedBuffer mat_q_buf; public: - MatmulChain(const BenchmarkArgs &_args) : args(_args) { - mat_size = args.problem_size; - } - - void setup() { - mat_a = std::vector(mat_size * mat_size); - mat_b = std::vector(mat_size * mat_size); - mat_c = std::vector(mat_size * mat_size); - mat_d = std::vector(mat_size * mat_size); - mat_res = std::vector(mat_size * mat_size); - - // Initialize matrices to the identity - for(size_t i = 0; i < mat_size; ++i) { - for(size_t j = 0; j < mat_size; ++j) { - mat_a[i * mat_size + j] = i == j; - mat_b[i * mat_size + j] = i == j; - mat_c[i * mat_size + j] = i == j; - mat_d[i * mat_size + j] = i == j; - } - } - - mat_a_buf.initialize(args.device_queue, mat_a.data(), cl::sycl::range<2>(mat_size, mat_size)); - mat_b_buf.initialize(args.device_queue, mat_b.data(), cl::sycl::range<2>(mat_size, mat_size)); - mat_c_buf.initialize(args.device_queue, mat_c.data(), cl::sycl::range<2>(mat_size, mat_size)); - mat_d_buf.initialize(args.device_queue, mat_d.data(), cl::sycl::range<2>(mat_size, mat_size)); - mat_res_buf.initialize(args.device_queue, mat_res.data(), cl::sycl::range<2>(mat_size, mat_size)); - mat_p_buf.initialize(args.device_queue, cl::sycl::range<2>(mat_size, mat_size)); - mat_q_buf.initialize(args.device_queue, cl::sycl::range<2>(mat_size, mat_size)); - } - - void run() { - multiply(args.device_queue, mat_a_buf.get(), mat_b_buf.get(), mat_p_buf.get(), mat_size); - multiply(args.device_queue, mat_c_buf.get(), mat_d_buf.get(), mat_q_buf.get(), mat_size); - multiply(args.device_queue, mat_p_buf.get(), mat_q_buf.get(), mat_res_buf.get(), mat_size); - } + MatmulChain(const BenchmarkArgs& _args) : args(_args) { mat_size = args.problem_size; } + + void setup() { + mat_a = std::vector(mat_size * mat_size); + mat_b = std::vector(mat_size * mat_size); + mat_c = std::vector(mat_size * mat_size); + mat_d = std::vector(mat_size * mat_size); + mat_res = std::vector(mat_size * mat_size); + + // Initialize matrices to the identity + for(size_t i = 0; i < mat_size; ++i) { + for(size_t j = 0; j < mat_size; ++j) { + mat_a[i * mat_size + j] = i == j; + mat_b[i * mat_size + j] = i == j; + mat_c[i * mat_size + j] = i == j; + mat_d[i * mat_size + j] = i == j; + } + } + + mat_a_buf.initialize(args.device_queue, mat_a.data(), sycl::range<2>(mat_size, mat_size)); + mat_b_buf.initialize(args.device_queue, mat_b.data(), sycl::range<2>(mat_size, mat_size)); + mat_c_buf.initialize(args.device_queue, mat_c.data(), sycl::range<2>(mat_size, mat_size)); + mat_d_buf.initialize(args.device_queue, mat_d.data(), sycl::range<2>(mat_size, mat_size)); + mat_res_buf.initialize(args.device_queue, mat_res.data(), sycl::range<2>(mat_size, mat_size)); + mat_p_buf.initialize(args.device_queue, sycl::range<2>(mat_size, mat_size)); + mat_q_buf.initialize(args.device_queue, sycl::range<2>(mat_size, mat_size)); + } + + void run() { + multiply(args.device_queue, mat_a_buf.get(), mat_b_buf.get(), mat_p_buf.get(), mat_size); + multiply(args.device_queue, mat_c_buf.get(), mat_d_buf.get(), mat_q_buf.get(), mat_size); + multiply(args.device_queue, mat_p_buf.get(), mat_q_buf.get(), mat_res_buf.get(), mat_size); + } static std::string getBenchmarkName(BenchmarkArgs& args) { return "MatmulChain"; } - bool verify(VerificationSetting &ver) { - // Triggers writeback - mat_res_buf.reset(); - bool verification_passed = true; - - for(size_t i = 0; i < mat_size; ++i) { - for(size_t j = 0; j < mat_size; ++j) { - const T kernel_value = mat_res[i * mat_size + j]; - const T host_value = i == j; - if(kernel_value != host_value) { - fprintf(stderr, "VERIFICATION FAILED for element %ld,%ld: %f != %f\n", i, j, kernel_value, host_value); - verification_passed = false; - break; - } - } - if(!verification_passed) { break; } - } - return verification_passed; - } + bool verify(VerificationSetting& ver) { + // Triggers writeback + mat_res_buf.reset(); + bool verification_passed = true; + + for(size_t i = 0; i < mat_size; ++i) { + for(size_t j = 0; j < mat_size; ++j) { + const T kernel_value = mat_res[i * mat_size + j]; + const T host_value = i == j; + if(kernel_value != host_value) { + fprintf(stderr, "VERIFICATION FAILED for element %ld,%ld: %f != %f\n", i, j, kernel_value, host_value); + verification_passed = false; + break; + } + } + if(!verification_passed) { + break; + } + } + return verification_passed; + } }; int main(int argc, char** argv) { - BenchmarkApp app(argc, argv); - - // float - app.run< MatmulChain >(); + BenchmarkApp app(argc, argv); + + // float + app.run>(); } diff --git a/runtime/short_long.cpp b/runtime/short_long.cpp index 83224239..736539ec 100644 --- a/runtime/short_long.cpp +++ b/runtime/short_long.cpp @@ -1 +1,2 @@ -TODO SYCL code with multiple tasks A -> b1 ... bn -> D and A -> C -> D where b1...bn are short task and C is a long task, so that runtime capabilities of kernel overlapping is evaluated +TODO SYCL code with multiple tasks A->b1... bn->D and A->C->D where b1... bn are short task and C is a long task, + so that runtime capabilities of kernel overlapping is evaluated diff --git a/single-kernel/kmeans.cpp b/single-kernel/kmeans.cpp index 974912ea..4c8da8c1 100644 --- a/single-kernel/kmeans.cpp +++ b/single-kernel/kmeans.cpp @@ -5,37 +5,38 @@ #define FLT_MAX 500000.0 #endif -//using namespace cl::sycl; -namespace s = cl::sycl; -template class KmeansKernel; +// using namespace sycl; +namespace s = sycl; +template +class KmeansKernel; template -class KmeansBench -{ -protected: - std::vector features; - std::vector clusters; - std::vector membership; - int nfeatures; - int nclusters; - int feature_size; - int cluster_size; - BenchmarkArgs args; - - - PrefetchedBuffer features_buf; - PrefetchedBuffer clusters_buf; - PrefetchedBuffer membership_buf; +class KmeansBench { +protected: + std::vector features; + std::vector clusters; + std::vector membership; + int nfeatures; + int nclusters; + int feature_size; + int cluster_size; + BenchmarkArgs args; + + + PrefetchedBuffer features_buf; + PrefetchedBuffer clusters_buf; + PrefetchedBuffer membership_buf; + public: - KmeansBench(const BenchmarkArgs &_args) : args(_args) {} - - void setup() { + KmeansBench(const BenchmarkArgs& _args) : args(_args) {} + + void setup() { // host memory allocation and initialization nfeatures = 2; nclusters = 3; - feature_size = nfeatures*args.problem_size; - cluster_size = nclusters*args.problem_size; + feature_size = nfeatures * args.problem_size; + cluster_size = nclusters * args.problem_size; features.resize(feature_size, 2.0f); clusters.resize(cluster_size, 1.0f); @@ -46,43 +47,41 @@ class KmeansBench membership_buf.initialize(args.device_queue, membership.data(), s::range<1>(args.problem_size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto features = features_buf.template get_access(cgh); auto clusters = clusters_buf.template get_access(cgh); auto membership = membership_buf.template get_access(cgh); - cl::sycl::range<1> ndrange(args.problem_size); - - cgh.parallel_for>(ndrange, - [features, clusters, membership, problem_size = args.problem_size, - nclusters_ = nclusters, nfeatures_ = nfeatures] - (cl::sycl::id<1> idx){ - - size_t gid = idx[0]; - - if(gid < problem_size) { - int index = 0; - T min_dist = FLT_MAX; - for(size_t i = 0; i < nclusters_; i++) { - T dist = 0; - for(size_t l = 0; l < nfeatures_; l++) { - dist += (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]) * - (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]); - } - if(dist < min_dist) { - min_dist = dist; - index = gid; + sycl::range<1> ndrange(args.problem_size); + + cgh.parallel_for>( + ndrange, [features, clusters, membership, problem_size = args.problem_size, nclusters_ = nclusters, + nfeatures_ = nfeatures](sycl::id<1> idx) { + size_t gid = idx[0]; + + if(gid < problem_size) { + int index = 0; + T min_dist = FLT_MAX; + for(size_t i = 0; i < nclusters_; i++) { + T dist = 0; + for(size_t l = 0; l < nfeatures_; l++) { + dist += (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]) * + (features[l * problem_size + gid] - clusters[i * nfeatures_ + l]); + } + if(dist < min_dist) { + min_dist = dist; + index = gid; + } + } + membership[gid] = index; } - } - membership[gid] = index; - } - }); + }); })); } - bool verify(VerificationSetting &ver) { - auto membership_acc = membership_buf.template get_access(); + bool verify(VerificationSetting& ver) { + auto membership_acc = membership_buf.get_host_access(); bool pass = true; unsigned int equal = 1; @@ -118,15 +117,15 @@ class KmeansBench std::stringstream name; name << "Kmeans_"; name << ReadableTypename::name; - return name.str(); + return name.str(); } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - app.run> (); - if(app.deviceSupportsFP64()) + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } return 0; } diff --git a/single-kernel/lin_reg_coeff.cpp b/single-kernel/lin_reg_coeff.cpp index 47ad637d..29fb7a87 100644 --- a/single-kernel/lin_reg_coeff.cpp +++ b/single-kernel/lin_reg_coeff.cpp @@ -1,35 +1,36 @@ #include "common.h" #include -//using namespace cl::sycl; -namespace s = cl::sycl; +// using namespace sycl; +namespace s = sycl; -template class VecProductKernel; -template class VecReduceKernel; +template +class VecProductKernel; +template +class VecReduceKernel; template -class LinearRegressionCoeffBench -{ -protected: - std::vector input1; - std::vector input2; - std::vector output; - T coeff_b1; - T coeff_b0; - - // Only needed for verification as reduction is done inplace which modifies the input - std::vector input1ver; - std::vector input2ver; - BenchmarkArgs args; - - PrefetchedBuffer input1_buf; - PrefetchedBuffer input2_buf; - PrefetchedBuffer output_buf; +class LinearRegressionCoeffBench { +protected: + std::vector input1; + std::vector input2; + std::vector output; + T coeff_b1; + T coeff_b0; + + // Only needed for verification as reduction is done inplace which modifies the input + std::vector input1ver; + std::vector input2ver; + BenchmarkArgs args; + + PrefetchedBuffer input1_buf; + PrefetchedBuffer input2_buf; + PrefetchedBuffer output_buf; public: - LinearRegressionCoeffBench(const BenchmarkArgs &_args) : args(_args) {} - - void setup() { + LinearRegressionCoeffBench(const BenchmarkArgs& _args) : args(_args) {} + + void setup() { // host memory allocation and initialization input1.resize(args.problem_size); input2.resize(args.problem_size); @@ -38,9 +39,9 @@ class LinearRegressionCoeffBench input1ver.resize(args.problem_size); input2ver.resize(args.problem_size); - for (size_t i = 0; i < args.problem_size; i++) { - input1ver[i] = input1[i] = 1.0; - input2ver[i] = input2[i] = 2.0; + for(size_t i = 0; i < args.problem_size; i++) { + input1ver[i] = input1[i] = 1.0; + input2ver[i] = input2[i] = 2.0; } input1_buf.initialize(args.device_queue, input1.data(), s::range<1>(args.problem_size)); @@ -48,82 +49,77 @@ class LinearRegressionCoeffBench output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size)); } - void vec_product(std::vector& events, s::buffer &input1_buf, s::buffer &input2_buf, s::buffer &output_buf) { - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + void vec_product(std::vector& events, s::buffer& input1_buf, s::buffer& input2_buf, + s::buffer& output_buf) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in1 = input1_buf.template get_access(cgh); auto in2 = input2_buf.template get_access(cgh); - - // Use discard_write here, otherwise the content of the host buffer must first be copied to device + + // Use discard_write here, otherwise the content of the host buffer must first be copied to device auto intermediate_product = output_buf.template get_access(cgh); - cl::sycl::nd_range<1> ndrange (args.problem_size, args.local_size); + sycl::nd_range<1> ndrange(args.problem_size, args.local_size); - cgh.parallel_for>(ndrange, - [=](cl::sycl::nd_item<1> item) - { - size_t gid= item.get_global_linear_id(); - intermediate_product[gid] = in1[gid] * in2[gid]; - }); + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + size_t gid = item.get_global_linear_id(); + intermediate_product[gid] = in1[gid] * in2[gid]; + }); })); } -T reduce(std::vector& events, s::buffer &input_buf) { - auto array_size = args.problem_size; - auto wgroup_size = args.local_size; - // Not yet tested with more than 2 - auto elements_per_thread = 2; + T reduce(std::vector& events, s::buffer& input_buf) { + auto array_size = args.problem_size; + auto wgroup_size = args.local_size; + // Not yet tested with more than 2 + auto elements_per_thread = 2; - while (array_size!= 1) { - auto n_wgroups = (array_size + wgroup_size*elements_per_thread - 1)/(wgroup_size*elements_per_thread); // two threads per work item - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + while(array_size != 1) { + auto n_wgroups = (array_size + wgroup_size * elements_per_thread - 1) / + (wgroup_size * elements_per_thread); // two threads per work item + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto global_mem = input_buf.template get_access(cgh); - + // local memory for reduction - auto local_mem = s::accessor {s::range<1>(wgroup_size), cgh}; - cl::sycl::nd_range<1> ndrange (n_wgroups*wgroup_size, wgroup_size); - - cgh.parallel_for>(ndrange, - [=](cl::sycl::nd_item<1> item) - { - size_t gid= item.get_global_linear_id(); - size_t lid = item.get_local_linear_id(); - - // initialize local memory to 0 - local_mem[lid] = 0; - - if ((elements_per_thread * gid) < array_size) { - local_mem[lid] = global_mem[elements_per_thread*gid] + global_mem[elements_per_thread*gid + 1]; - } + auto local_mem = s::local_accessor{s::range<1>(wgroup_size), cgh}; + sycl::nd_range<1> ndrange(n_wgroups * wgroup_size, wgroup_size); - item.barrier(s::access::fence_space::local_space); + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + size_t gid = item.get_global_linear_id(); + size_t lid = item.get_local_linear_id(); - for (size_t stride = 1; stride < wgroup_size; stride *= elements_per_thread) { - auto local_mem_index = elements_per_thread * stride * lid; - if (local_mem_index < wgroup_size) { - local_mem[local_mem_index] = local_mem[local_mem_index] + local_mem[local_mem_index + stride]; - } + // initialize local memory to 0 + local_mem[lid] = 0; - item.barrier(s::access::fence_space::local_space); - } + if((elements_per_thread * gid) < array_size) { + local_mem[lid] = global_mem[elements_per_thread * gid] + global_mem[elements_per_thread * gid + 1]; + } + + sycl::group_barrier(item.get_group()); - // Only one work-item per work group writes to global memory - if (lid == 0) { - global_mem[item.get_group_linear_id()] = local_mem[0]; + + for(size_t stride = 1; stride < wgroup_size; stride *= elements_per_thread) { + auto local_mem_index = elements_per_thread * stride * lid; + if(local_mem_index < wgroup_size) { + local_mem[local_mem_index] = local_mem[local_mem_index] + local_mem[local_mem_index + stride]; } - }); + + sycl::group_barrier(item.get_group()); + } + + // Only one work-item per work group writes to global memory + if(lid == 0) { + global_mem[item.get_group_linear_id()] = local_mem[0]; + } + }); })); - array_size = n_wgroups; + array_size = n_wgroups; + } + auto reduced_value = input_buf.get_host_access(); + return (reduced_value[0]); } - auto reduced_value = input_buf.template get_access(); - return(reduced_value[0]); -} - - void run(std::vector& events) { + void run(std::vector& events) { vec_product(events, input1_buf.get(), input2_buf.get(), output_buf.get()); T ss_xy = reduce(events, output_buf.get()); @@ -132,71 +128,72 @@ T reduce(std::vector& events, s::buffer &input_buf) { T ss_xx = reduce(events, output_buf.get()); - T mean_x = reduce(events, input1_buf.get())/args.problem_size; - T mean_y = reduce(events, input2_buf.get())/args.problem_size; + T mean_x = reduce(events, input1_buf.get()) / args.problem_size; + T mean_y = reduce(events, input2_buf.get()) / args.problem_size; - ss_xy = ss_xy - mean_x*mean_y; - ss_xx = ss_xx - mean_x*mean_x; + ss_xy = ss_xy - mean_x * mean_y; + ss_xx = ss_xx - mean_x * mean_x; - coeff_b1 = ss_xy/ss_xx; - coeff_b0 = mean_y - coeff_b1*mean_x; + coeff_b1 = ss_xy / ss_xx; + coeff_b0 = mean_y - coeff_b1 * mean_x; - //std::cout << "ss_xy = " << ss_xy << "ss_xx = " << ss_xx << std::endl; - //std::cout << "Mean_x = " << mean_x << "Mean_y = " << mean_y << std::endl; - //std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl; + // std::cout << "ss_xy = " << ss_xy << "ss_xx = " << ss_xx << std::endl; + // std::cout << "Mean_x = " << mean_x << "Mean_y = " << mean_y << std::endl; + // std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl; } - bool verify(VerificationSetting &ver) { - bool pass = true; - + bool verify(VerificationSetting& ver) { + bool pass = true; + T sum_of_vec1 = 0; T sum_of_vec2 = 0; - for (size_t i = 0; i < args.problem_size; i++) { + for(size_t i = 0; i < args.problem_size; i++) { sum_of_vec1 += input1ver[i]; sum_of_vec2 += input2ver[i]; } - T mean_x = sum_of_vec1/args.problem_size; - T mean_y = sum_of_vec2/args.problem_size; + T mean_x = sum_of_vec1 / args.problem_size; + T mean_y = sum_of_vec2 / args.problem_size; T ss_xy = 0; T ss_xx = 0; - for (size_t i = 0; i < args.problem_size; i++) { - ss_xy += input1ver[i]*input2ver[i]; - ss_xx += input1ver[i]*input1ver[i]; + for(size_t i = 0; i < args.problem_size; i++) { + ss_xy += input1ver[i] * input2ver[i]; + ss_xx += input1ver[i] * input1ver[i]; } - ss_xy = ss_xy - mean_x*mean_y; - ss_xx = ss_xx - mean_x*mean_x; + ss_xy = ss_xy - mean_x * mean_y; + ss_xx = ss_xx - mean_x * mean_x; - T expected_coeff_b1 = ss_xy/ss_xx; - T expected_coeff_b0 = mean_y - expected_coeff_b1*mean_x; + T expected_coeff_b1 = ss_xy / ss_xx; + T expected_coeff_b0 = mean_y - expected_coeff_b1 * mean_x; - //std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl; - //std::cout << "Expected Coeff_b1 = " << expected_coeff_b1 << ", " << "Expected Coeff_b0 = " << expected_coeff_b0 << std::endl; + // std::cout << "Coeff_b1 = " << coeff_b1 << ", " << "Coeff_b0 = " << coeff_b0 << std::endl; + // std::cout << "Expected Coeff_b1 = " << expected_coeff_b1 << ", " << "Expected Coeff_b0 = " << expected_coeff_b0 + // << std::endl; const T tolerance = 0.00001; - if ((fabs(expected_coeff_b0 - coeff_b0) > tolerance) || (fabs(expected_coeff_b1 - coeff_b1) > tolerance)) + if((fabs(expected_coeff_b0 - coeff_b0) > tolerance) || (fabs(expected_coeff_b1 - coeff_b1) > tolerance)) pass = false; return pass; } - + static std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; name << "LinearRegressionCoeff_"; name << ReadableTypename::name; - return name.str(); + return name.str(); } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - if(app.shouldRunNDRangeKernels()){ + if(app.shouldRunNDRangeKernels()) { app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } } return 0; } diff --git a/single-kernel/lin_reg_error.cpp b/single-kernel/lin_reg_error.cpp index c2d169e1..0b49e097 100644 --- a/single-kernel/lin_reg_error.cpp +++ b/single-kernel/lin_reg_error.cpp @@ -1,32 +1,32 @@ #include "common.h" #include -//using namespace cl::sycl; -namespace s = cl::sycl; -template class LinearRegressionKernel; +// using namespace sycl; +namespace s = sycl; +template +class LinearRegressionKernel; template -class LinearRegressionBench -{ -protected: - std::vector input1; - std::vector input2; - std::vector alpha; - std::vector beta; - std::vector output; - std::vector expected_output; - BenchmarkArgs args; - - PrefetchedBuffer input1_buf; - PrefetchedBuffer input2_buf; - PrefetchedBuffer alpha_buf; - PrefetchedBuffer beta_buf; - PrefetchedBuffer output_buf; +class LinearRegressionBench { +protected: + std::vector input1; + std::vector input2; + std::vector alpha; + std::vector beta; + std::vector output; + std::vector expected_output; + BenchmarkArgs args; + + PrefetchedBuffer input1_buf; + PrefetchedBuffer input2_buf; + PrefetchedBuffer alpha_buf; + PrefetchedBuffer beta_buf; + PrefetchedBuffer output_buf; public: - LinearRegressionBench(const BenchmarkArgs &_args) : args(_args) {} - - void setup() { + LinearRegressionBench(const BenchmarkArgs& _args) : args(_args) {} + + void setup() { // host memory allocation and initialization input1.resize(args.problem_size); input2.resize(args.problem_size); @@ -35,24 +35,22 @@ class LinearRegressionBench output.resize(args.problem_size, 0); expected_output.resize(args.problem_size, 0); - for (size_t i = 0; i < args.problem_size; i++) { - input1[i] = static_cast (rand()) / static_cast (RAND_MAX); - input2[i] = static_cast (rand()) / static_cast (RAND_MAX); - alpha[i] = static_cast (rand()) / static_cast (RAND_MAX); - beta[i] = static_cast (rand()) / static_cast (RAND_MAX); + for(size_t i = 0; i < args.problem_size; i++) { + input1[i] = static_cast(rand()) / static_cast(RAND_MAX); + input2[i] = static_cast(rand()) / static_cast(RAND_MAX); + alpha[i] = static_cast(rand()) / static_cast(RAND_MAX); + beta[i] = static_cast(rand()) / static_cast(RAND_MAX); } input1_buf.initialize(args.device_queue, input1.data(), s::range<1>(args.problem_size)); input2_buf.initialize(args.device_queue, input2.data(), s::range<1>(args.problem_size)); - alpha_buf. initialize(args.device_queue, alpha.data(), s::range<1>(args.problem_size)); - beta_buf. initialize(args.device_queue, beta.data(), s::range<1>(args.problem_size)); + alpha_buf.initialize(args.device_queue, alpha.data(), s::range<1>(args.problem_size)); + beta_buf.initialize(args.device_queue, beta.data(), s::range<1>(args.problem_size)); output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size)); } - void run(std::vector& events) { - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in1 = input1_buf.template get_access(cgh); auto in2 = input2_buf.template get_access(cgh); auto alpha = alpha_buf.template get_access(cgh); @@ -60,79 +58,77 @@ class LinearRegressionBench // Use discard_write here, otherwise the content of the host buffer must first be copied to device auto output = output_buf.template get_access(cgh); - cl::sycl::range<1> ndrange (args.problem_size); + sycl::range<1> ndrange(args.problem_size); - cgh.parallel_for>(ndrange, - [=, problem_size = args.problem_size](cl::sycl::id<1> idx) - { - size_t gid= idx[0]; - T a = alpha[gid]; - T b = beta[gid]; - T error = 0.0; - if (gid < problem_size) { + cgh.parallel_for>( + ndrange, [=, problem_size = args.problem_size](sycl::id<1> idx) { + size_t gid = idx[0]; + T a = alpha[gid]; + T b = beta[gid]; + T error = 0.0; + if(gid < problem_size) { // Use parallel reduction to add errors - for (size_t i = 0; i < problem_size; i++) { - T e = (a*in1[i] + b) - in2[i]; - error += e*e; + for(size_t i = 0; i < problem_size; i++) { + T e = (a * in1[i] + b) - in2[i]; + error += e * e; } - } - output[gid] = error; - }); + } + output[gid] = error; + }); })); } bool compare(const std::vector& expected_output, const int length, const T epsilon) { - T error = 0.0f; - T ref = 0.0f; + T error = 0.0f; + T ref = 0.0f; - auto output = output_buf.template get_access(); + auto output = output_buf.get_host_access(); - for(size_t i = 0; i < length; ++i) { - T diff = expected_output[i] - output[i]; - error += diff * diff; - ref += expected_output[i] * expected_output[i]; - } + for(size_t i = 0; i < length; ++i) { + T diff = expected_output[i] - output[i]; + error += diff * diff; + ref += expected_output[i] * expected_output[i]; + } - T normRef = sqrtf((T) ref); - if (fabs(ref) < 1e-7f) { - return false; - } + T normRef = sqrtf((T)ref); + if(fabs(ref) < 1e-7f) { + return false; + } - T normError = sqrtf((T) error); - error = normError / normRef; + T normError = sqrtf((T)error); + error = normError / normRef; - //std::cout << "error =" << error << "epsilon =" << epsilon; + // std::cout << "error =" << error << "epsilon =" << epsilon; - return error < epsilon; + return error < epsilon; } - bool verify(VerificationSetting &ver) { - - for (size_t i = 0; i < args.problem_size; i ++) { + bool verify(VerificationSetting& ver) { + for(size_t i = 0; i < args.problem_size; i++) { T error = 0.0; for(size_t j = 0; j < args.problem_size; j++) { T e = (alpha[i] * input1[j] + beta[i]) - input2[j]; - error += e*e; + error += e * e; } - expected_output[i] = error; + expected_output[i] = error; } return compare(expected_output, args.problem_size, 0.000001); } - + static std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; name << "LinearRegression_"; name << ReadableTypename::name; - return name.str(); + return name.str(); } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } return 0; } diff --git a/single-kernel/median.cpp b/single-kernel/median.cpp index 2c7d4c8d..d8659f33 100644 --- a/single-kernel/median.cpp +++ b/single-kernel/median.cpp @@ -1,14 +1,14 @@ -#include #include +#include -#include "common.h" #include "bitmap.h" +#include "common.h" -namespace s = cl::sycl; +namespace s = sycl; class MedianFilterBenchKernel; // kernel forward declaration -void swap(cl::sycl::float4 A[], int i, int j) { +void swap(sycl::float4 A[], int i, int j) { /*if(A[i] > A[j]) { float temp = A[i]; A[i] = A[j]; @@ -20,126 +20,121 @@ void swap(cl::sycl::float4 A[], int i, int j) { /* A median filter with a windows of 3 pixels (3x3). - Input and output are two-dimensional buffers of floats. + Input and output are two-dimensional buffers of floats. */ -class MedianFilterBench -{ +class MedianFilterBench { protected: - std::vector input; - std::vector output; + std::vector input; + std::vector output; - size_t w, h; // size of the input picture - size_t size; // user-defined size (input and output will be size x size) - BenchmarkArgs args; + size_t w, h; // size of the input picture + size_t size; // user-defined size (input and output will be size x size) + BenchmarkArgs args; - PrefetchedBuffer input_buf; - PrefetchedBuffer output_buf; + PrefetchedBuffer input_buf; + PrefetchedBuffer output_buf; public: - MedianFilterBench(const BenchmarkArgs &_args) : args(_args) {} + MedianFilterBench(const BenchmarkArgs& _args) : args(_args) {} void setup() { size = args.problem_size; // input size defined by the user - input.resize(size * size); + input.resize(size * size); load_bitmap_mirrored("../share/Brommy.bmp", size, input); output.resize(size * size); - input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size)); + input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size)); output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size)); } - void run(std::vector& events) { - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { - auto in = input_buf .get_access(cgh); + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { + auto in = input_buf.get_access(cgh); auto out = output_buf.get_access(cgh); - cl::sycl::range<2> ndrange {size, size}; - - cgh.parallel_for(ndrange, - [in, out, size_ = size](cl::sycl::id<2> gid) - { - int x = gid[0]; - int y = gid[1]; - - // Optimization note: this array can be prefetched in local memory, TODO - cl::sycl::float4 window[9]; - int k = 0; - for(int i = -1; i<2; i++) - for(int j = -1; j<2; j++) { - uint xs = s::min(s::max(x+j, 0), static_cast(size_-1)); // borders are handled here with extended values - uint ys = s::min(s::max(y+i, 0), static_cast(size_-1)); - window[k] =in[ {xs,ys} ]; - k++; - } - - // (channel-wise) median selection using bitonic sorting - // the following network is used (Bose-Nelson algorithm): - // [[0,1],[2,3],[4,5],[7,8]] - // [[0,2],[1,3],[6,8]] - // [[1,2],[6,7],[5,8]] - // [[4,7],[3,8]] - // [[4,6],[5,7]] - // [[5,6],[2,7]] - // [[0,5],[1,6],[3,7]] - // [[0,4],[1,5],[3,6]] - // [[1,4],[2,5]] - // [[2,4],[3,5]] - // [[3,4]] - // se also http://pages.ripco.net/~jgamble/nw.html - swap(window, 0, 1); - swap(window, 2, 3); - swap(window, 0, 2); - swap(window, 1, 3); - swap(window, 1, 2); - swap(window, 4, 5); - swap(window, 7, 8); - swap(window, 6, 8); - swap(window, 6, 7); - swap(window, 4, 7); - swap(window, 4, 6); - swap(window, 5, 8); - swap(window, 5, 7); - swap(window, 5, 6); - swap(window, 0, 5); - swap(window, 0, 4); - swap(window, 1, 6); - swap(window, 1, 5); - swap(window, 1, 4); - swap(window, 2, 7); - swap(window, 3, 8); - swap(window, 3, 7); - swap(window, 2, 5); - swap(window, 2, 4); - swap(window, 3, 6); - swap(window, 3, 5); - swap(window, 3, 4); - - out[gid] = window[4]; - } - ); - })); - - args.device_queue.wait_and_throw(); - } - - - bool verify(VerificationSetting &ver) { + sycl::range<2> ndrange{size, size}; + + cgh.parallel_for(ndrange, [in, out, size_ = size](sycl::id<2> gid) { + int x = gid[0]; + int y = gid[1]; + + // Optimization note: this array can be prefetched in local memory, TODO + sycl::float4 window[9]; + int k = 0; + for(int i = -1; i < 2; i++) + for(int j = -1; j < 2; j++) { + uint xs = + s::min(s::max(x + j, 0), static_cast(size_ - 1)); // borders are handled here with extended values + uint ys = s::min(s::max(y + i, 0), static_cast(size_ - 1)); + window[k] = in[{xs, ys}]; + k++; + } + + // (channel-wise) median selection using bitonic sorting + // the following network is used (Bose-Nelson algorithm): + // [[0,1],[2,3],[4,5],[7,8]] + // [[0,2],[1,3],[6,8]] + // [[1,2],[6,7],[5,8]] + // [[4,7],[3,8]] + // [[4,6],[5,7]] + // [[5,6],[2,7]] + // [[0,5],[1,6],[3,7]] + // [[0,4],[1,5],[3,6]] + // [[1,4],[2,5]] + // [[2,4],[3,5]] + // [[3,4]] + // se also http://pages.ripco.net/~jgamble/nw.html + swap(window, 0, 1); + swap(window, 2, 3); + swap(window, 0, 2); + swap(window, 1, 3); + swap(window, 1, 2); + swap(window, 4, 5); + swap(window, 7, 8); + swap(window, 6, 8); + swap(window, 6, 7); + swap(window, 4, 7); + swap(window, 4, 6); + swap(window, 5, 8); + swap(window, 5, 7); + swap(window, 5, 6); + swap(window, 0, 5); + swap(window, 0, 4); + swap(window, 1, 6); + swap(window, 1, 5); + swap(window, 1, 4); + swap(window, 2, 7); + swap(window, 3, 8); + swap(window, 3, 7); + swap(window, 2, 5); + swap(window, 2, 4); + swap(window, 3, 6); + swap(window, 3, 5); + swap(window, 3, 4); + + out[gid] = window[4]; + }); + })); + + args.device_queue.wait_and_throw(); + } + + + bool verify(VerificationSetting& ver) { save_bitmap("median.bmp", size, output); bool pass = true; - auto output_acc = output_buf.get_access(); + auto output_acc = output_buf.get_host_access(); - for(size_t i=ver.begin[0]; i 0.01f) - { + sycl::float4 expected = window[4]; + sycl::float4 dif = fdim(output_acc.get_pointer()[i], expected); + float length = sycl::length(dif); + if(length > 0.01f) { pass = false; break; } - } + } return pass; -} + } -static std::string getBenchmarkName(BenchmarkArgs& args) { - return "MedianFilter"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "MedianFilter"; } }; // MedianFilterBench class -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - app.run(); + app.run(); return 0; } - - diff --git a/single-kernel/mol_dyn.cpp b/single-kernel/mol_dyn.cpp index 917c2802..df1c7204 100644 --- a/single-kernel/mol_dyn.cpp +++ b/single-kernel/mol_dyn.cpp @@ -1,12 +1,11 @@ #include "common.h" #include -//using namespace cl::sycl; -namespace s = cl::sycl; +// using namespace sycl; +namespace s = sycl; class MolecularDynamicsKernel; -class MolecularDynamicsBench -{ +class MolecularDynamicsBench { protected: std::vector input; std::vector output; @@ -23,7 +22,7 @@ class MolecularDynamicsBench PrefetchedBuffer output_buf; public: - MolecularDynamicsBench(const BenchmarkArgs &_args) : args(_args) {} + MolecularDynamicsBench(const BenchmarkArgs& _args) : args(_args) {} void setup() { // host memory allocation and initialization @@ -50,104 +49,97 @@ class MolecularDynamicsBench output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size * sizeof(s::float4))); } - void run(std::vector& events) { - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.get_access(cgh); auto neigh = neighbour_buf.get_access(cgh); auto out = output_buf.get_access(cgh); - cl::sycl::range<1> ndrange (args.problem_size); - - cgh.parallel_for(ndrange, - [=, problem_size = args.problem_size, neighCount_ = neighCount, - inum_ = inum, cutsq_ = cutsq, lj1_ = lj1, lj2_ = lj2] - (cl::sycl::id<1> idx) - { - size_t gid= idx[0]; - - if (gid < problem_size) { - s::float4 ipos = in[gid]; - s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f}; - int j = 0; - while (j < neighCount_) { - int jidx = neigh[j*inum_ + gid]; - s::float4 jpos = in[jidx]; - - // Calculate distance - float delx = ipos.x() - jpos.x(); - float dely = ipos.y() - jpos.y(); - float delz = ipos.z() - jpos.z(); - float r2inv = delx*delx + dely*dely + delz*delz; - - // If distance is less than cutoff, calculate force - if (r2inv < cutsq_) { - r2inv = 10.0f/r2inv; - float r6inv = r2inv * r2inv * r2inv; - float forceC = r2inv*r6inv*(lj1_*r6inv - lj2_); - - f.x() += delx * forceC; - f.y() += dely * forceC; - f.z() += delz * forceC; - } - j++; + sycl::range<1> ndrange(args.problem_size); + + cgh.parallel_for( + ndrange, [=, problem_size = args.problem_size, neighCount_ = neighCount, inum_ = inum, cutsq_ = cutsq, + lj1_ = lj1, lj2_ = lj2](sycl::id<1> idx) { + size_t gid = idx[0]; + + if(gid < problem_size) { + s::float4 ipos = in[gid]; + s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f}; + int j = 0; + while(j < neighCount_) { + int jidx = neigh[j * inum_ + gid]; + s::float4 jpos = in[jidx]; + + // Calculate distance + float delx = ipos.x() - jpos.x(); + float dely = ipos.y() - jpos.y(); + float delz = ipos.z() - jpos.z(); + float r2inv = delx * delx + dely * dely + delz * delz; + + // If distance is less than cutoff, calculate force + if(r2inv < cutsq_) { + r2inv = 10.0f / r2inv; + float r6inv = r2inv * r2inv * r2inv; + float forceC = r2inv * r6inv * (lj1_ * r6inv - lj2_); + + f.x() += delx * forceC; + f.y() += dely * forceC; + f.z() += delz * forceC; } - out[gid] = f; + j++; + } + out[gid] = f; } - }); + }); })); } - bool verify(VerificationSetting &ver) { - auto output_acc = output_buf.get_access(); + bool verify(VerificationSetting& ver) { + auto output_acc = output_buf.get_host_access(); bool pass = true; unsigned equal = 1; constexpr float maxErr = 10.f * std::numeric_limits::epsilon(); for(unsigned int i = 0; i < args.problem_size; ++i) { - s::float4 ipos = input[i]; - s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f}; - int j = 0; - while (j < neighCount) { - int jidx = neighbour[j*inum + i]; - s::float4 jpos = input[jidx]; - - // Calculate distance - float delx = ipos.x() - jpos.x(); - float dely = ipos.y() - jpos.y(); - float delz = ipos.z() - jpos.z(); - float r2inv = delx*delx + dely*dely + delz*delz; - - // If distance is less than cutoff, calculate force - if (r2inv < cutsq) { - r2inv = 10.0f/r2inv; - float r6inv = r2inv * r2inv * r2inv; - float forceC = r2inv*r6inv*(lj1*r6inv - lj2); - - f.x() += delx * forceC; - f.y() += dely * forceC; - f.z() += delz * forceC; - } - j++; + s::float4 ipos = input[i]; + s::float4 f = {0.0f, 0.0f, 0.0f, 0.0f}; + int j = 0; + while(j < neighCount) { + int jidx = neighbour[j * inum + i]; + s::float4 jpos = input[jidx]; + + // Calculate distance + float delx = ipos.x() - jpos.x(); + float dely = ipos.y() - jpos.y(); + float delz = ipos.z() - jpos.z(); + float r2inv = delx * delx + dely * dely + delz * delz; + + // If distance is less than cutoff, calculate force + if(r2inv < cutsq) { + r2inv = 10.0f / r2inv; + float r6inv = r2inv * r2inv * r2inv; + float forceC = r2inv * r6inv * (lj1 * r6inv - lj2); + + f.x() += delx * forceC; + f.y() += dely * forceC; + f.z() += delz * forceC; } + j++; + } - if(s::distance(f, output_acc[i]) / s::length(f) > maxErr) { - pass = false; - break; - } + if(s::distance(f, output_acc[i]) / s::length(f) > maxErr) { + pass = false; + break; + } } return pass; } - - static std::string getBenchmarkName(BenchmarkArgs& args) { - return "MolecularDynamics"; - } + + static std::string getBenchmarkName(BenchmarkArgs& args) { return "MolecularDynamics"; } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - app.run(); + app.run(); return 0; } diff --git a/single-kernel/nbody.cpp b/single-kernel/nbody.cpp index a080832a..0af9ef35 100644 --- a/single-kernel/nbody.cpp +++ b/single-kernel/nbody.cpp @@ -1,18 +1,19 @@ #include "common.h" -#include #include +#include #include -using namespace cl; +using namespace sycl; -template class NDRangeNBodyKernel; -template class HierarchicalNBodyKernel; +template +class NDRangeNBodyKernel; +template +class HierarchicalNBodyKernel; -template -class NBody -{ +template +class NBody { protected: using particle_type = sycl::vec; using vector_type = sycl::vec; @@ -31,19 +32,17 @@ class NBody PrefetchedBuffer particles_buf; PrefetchedBuffer velocities_buf; + public: - NBody(const BenchmarkArgs& _args) - : args(_args), gravitational_softening{1.e-5f}, dt{1.e-2f} { + NBody(const BenchmarkArgs& _args) : args(_args), gravitational_softening{1.e-5f}, dt{1.e-2f} { assert(args.problem_size % args.local_size == 0); } void setup() { - particles.resize(args.problem_size); velocities.resize(args.problem_size); for(std::size_t i = 0; i < args.problem_size; ++i) { - float_type rel_i = static_cast(i) / static_cast(args.problem_size); particles[i].x() = rel_i * std::cos(3000.f * 2.f * M_PI * rel_i); @@ -56,21 +55,21 @@ class NBody velocities[i].z() = 0; } - particles_buf. initialize(args.device_queue, this->particles.data(), sycl::range<1>{this->args.problem_size}); + particles_buf.initialize(args.device_queue, this->particles.data(), sycl::range<1>{this->args.problem_size}); velocities_buf.initialize(args.device_queue, this->velocities.data(), sycl::range<1>{this->args.problem_size}); - output_particles. initialize(args.device_queue, sycl::range<1>{args.problem_size}); + output_particles.initialize(args.device_queue, sycl::range<1>{args.problem_size}); output_velocities.initialize(args.device_queue, sycl::range<1>{args.problem_size}); } - bool verify(VerificationSetting &ver) { - auto resulting_particles = output_particles.template get_access(); - auto resulting_velocities = output_velocities.template get_access(); + bool verify(VerificationSetting& ver) { + auto resulting_particles = output_particles.get_host_access(); + auto resulting_velocities = output_velocities.get_host_access(); std::vector host_resulting_particles(particles.size()); std::vector host_resulting_velocities(particles.size()); - + for(std::size_t i = 0; i < particles.size(); ++i) { const particle_type my_p = particles[i]; @@ -78,27 +77,20 @@ class NBody vector_type acceleration{static_cast(0.0f)}; for(std::size_t j = 0; j < particles.size(); ++j) { - if(i != j) { const particle_type p = particles[j]; - - const vector_type R { - p.x() - my_p.x(), - p.y() - my_p.y(), - p.z() - my_p.z() - }; - const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + - gravitational_softening); + const vector_type R{p.x() - my_p.x(), p.y() - my_p.y(), p.z() - my_p.z()}; + + const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening); acceleration += static_cast(p.w()) * r_inv * r_inv * r_inv * R; } - } vector_type new_v = my_v + acceleration * dt; particle_type new_p = my_p; - new_p.x() += new_v.x() * dt; + new_p.x() += new_v.x() * dt; new_p.y() += new_v.y() * dt; new_p.z() += new_v.z() * dt; @@ -131,12 +123,10 @@ class NBody auto output_particles_access = output_particles.template get_access(cgh); auto output_velocities_access = output_velocities.template get_access(cgh); - auto scratch = sycl::accessor{ - sycl::range<1>{args.local_size}, cgh}; + auto scratch = sycl::local_accessor{sycl::range<1>{args.local_size}, cgh}; cgh.parallel_for>(execution_range, [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::nd_item<1> tid) { - const size_t global_id = tid.get_global_id(0); const size_t local_id = tid.get_local_id(0); const size_t num_particles = tid.get_global_range()[0]; @@ -153,7 +143,7 @@ class NBody scratch[local_id] = (global_id < num_particles) ? particles_access[offset + local_id] : particle_type{static_cast(0.0f)}; - tid.barrier(); + sycl::group_barrier(tid.get_group()); for(int i = 0; i < local_size; ++i) { const particle_type p = scratch[i]; @@ -168,7 +158,7 @@ class NBody acceleration += static_cast(p.w()) * r_inv * r_inv * r_inv * R; } - tid.barrier(); + sycl::group_barrier(tid.get_group()); } // This is a dirt cheap Euler integration, but could be @@ -199,8 +189,7 @@ class NBody auto output_particles_access = output_particles.template get_access(cgh); auto output_velocities_access = output_velocities.template get_access(cgh); - auto scratch = sycl::accessor{ - sycl::range<1>{args.local_size}, cgh}; + auto scratch = sycl::local_accessor{sycl::range<1>{args.local_size}, cgh}; const size_t local_size = args.local_size; @@ -267,20 +256,16 @@ class NBody } }; -template -class NBodyNDRange : public NBody -{ +template +class NBodyNDRange : public NBody { public: using typename NBody::particle_type; using typename NBody::vector_type; - NBodyNDRange(const BenchmarkArgs& _args) - : NBody{_args} {} + NBodyNDRange(const BenchmarkArgs& _args) : NBody{_args} {} - void run(){ - this->submitNDRange(this->particles_buf.get(), this->velocities_buf.get()); - } + void run() { this->submitNDRange(this->particles_buf.get(), this->velocities_buf.get()); } std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; @@ -291,43 +276,38 @@ class NBodyNDRange : public NBody }; -template -class NBodyHierarchical : public NBody -{ +template +class NBodyHierarchical : public NBody { public: using typename NBody::particle_type; using typename NBody::vector_type; - NBodyHierarchical(const BenchmarkArgs& _args) - : NBody{_args} {} + NBodyHierarchical(const BenchmarkArgs& _args) : NBody{_args} {} - void run(){ - this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get()); - } + void run() { this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get()); } std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; name << "NBody_Hierarchical_"; name << ReadableTypename::name; - + return name.str(); } }; -int main(int argc, char** argv) -{ - +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - app.run< NBodyHierarchical >(); - if(app.deviceSupportsFP64()) + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } if(app.shouldRunNDRangeKernels()) { - app.run< NBodyNDRange >(); - if(app.deviceSupportsFP64()) + app.run>(); + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } } return 0; diff --git a/single-kernel/perlin.cpp b/single-kernel/perlin.cpp index 633e2137..f97626fb 100644 --- a/single-kernel/perlin.cpp +++ b/single-kernel/perlin.cpp @@ -1 +1 @@ -TODO: import Perlin noise from the Insieme OpenCL benchmark +TODO : import Perlin noise from the Insieme OpenCL benchmark diff --git a/single-kernel/scalar_prod.cpp b/single-kernel/scalar_prod.cpp index 7583924b..da146f55 100644 --- a/single-kernel/scalar_prod.cpp +++ b/single-kernel/scalar_prod.cpp @@ -1,47 +1,46 @@ #include "common.h" +#include #include #include -#include -//using namespace cl::sycl; -namespace s = cl::sycl; +// using namespace sycl; +namespace s = sycl; -template +template class ScalarProdKernel; -template +template class ScalarProdKernelHierarchical; -template +template class ScalarProdReduction; -template +template class ScalarProdReductionHierarchical; -template +template class ScalarProdGatherKernel; -template -class ScalarProdBench -{ -protected: - std::vector input1; - std::vector input2; - std::vector output; - BenchmarkArgs args; +template +class ScalarProdBench { +protected: + std::vector input1; + std::vector input2; + std::vector output; + BenchmarkArgs args; - PrefetchedBuffer input1_buf; - PrefetchedBuffer input2_buf; - PrefetchedBuffer output_buf; + PrefetchedBuffer input1_buf; + PrefetchedBuffer input2_buf; + PrefetchedBuffer output_buf; public: - ScalarProdBench(const BenchmarkArgs &_args) : args(_args) {} - - void setup() { + ScalarProdBench(const BenchmarkArgs& _args) : args(_args) {} + + void setup() { // host memory allocation and initialization input1.resize(args.problem_size); input2.resize(args.problem_size); output.resize(args.problem_size); - for (size_t i = 0; i < args.problem_size; i++) { + for(size_t i = 0; i < args.problem_size; i++) { input1[i] = static_cast(1); input2[i] = static_cast(2); output[i] = static_cast(0); @@ -52,35 +51,29 @@ class ScalarProdBench output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size)); } - void run(std::vector& events) { - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in1 = input1_buf.template get_access(cgh); auto in2 = input2_buf.template get_access(cgh); // Use discard_write here, otherwise the content of the hostbuffer must first be copied to device auto intermediate_product = output_buf.template get_access(cgh); - if(Use_ndrange){ - cl::sycl::nd_range<1> ndrange (args.problem_size, args.local_size); + if(Use_ndrange) { + sycl::nd_range<1> ndrange(args.problem_size, args.local_size); - cgh.parallel_for>(ndrange, - [=](cl::sycl::nd_item<1> item) - { - size_t gid= item.get_global_linear_id(); - intermediate_product[gid] = in1[gid] * in2[gid]; - }); - } - else { + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + size_t gid = item.get_global_linear_id(); + intermediate_product[gid] = in1[gid] * in2[gid]; + }); + } else { cgh.parallel_for_work_group>( - cl::sycl::range<1>{args.problem_size / args.local_size}, - cl::sycl::range<1>{args.local_size}, - [=](cl::sycl::group<1> grp){ - grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){ - size_t gid = idx.get_global_id(0); - intermediate_product[gid] = in1[gid] * in2[gid]; + sycl::range<1>{args.problem_size / args.local_size}, sycl::range<1>{args.local_size}, + [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { + size_t gid = idx.get_global_id(0); + intermediate_product[gid] = in1[gid] * in2[gid]; + }); }); - }); } })); @@ -91,119 +84,109 @@ class ScalarProdBench // Not yet tested with more than 2 auto elements_per_thread = 2; - while (array_size!= 1) { - auto n_wgroups = (array_size + wgroup_size*elements_per_thread - 1)/(wgroup_size*elements_per_thread); // two threads per work item - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { - - auto global_mem = output_buf.template get_access(cgh); - - // local memory for reduction - auto local_mem = s::accessor {s::range<1>(wgroup_size), cgh}; - cl::sycl::nd_range<1> ndrange (n_wgroups*wgroup_size, wgroup_size); - - if(Use_ndrange) { - cgh.parallel_for>(ndrange, - [=](cl::sycl::nd_item<1> item) - { - size_t gid= item.get_global_linear_id(); - size_t lid = item.get_local_linear_id(); - - // initialize local memory to 0 - local_mem[lid] = 0; - - for(int i = 0; i < elements_per_thread; ++i) { - int input_element = gid + i * n_wgroups * wgroup_size; - - if(input_element < array_size) - local_mem[lid] += global_mem[input_element]; - } + while(array_size != 1) { + auto n_wgroups = (array_size + wgroup_size * elements_per_thread - 1) / + (wgroup_size * elements_per_thread); // two threads per work item - item.barrier(s::access::fence_space::local_space); + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { + auto global_mem = output_buf.template get_access(cgh); - for(size_t stride = wgroup_size/elements_per_thread; stride >= 1; stride /= elements_per_thread) { - if(lid < stride) { - for(int i = 0; i < elements_per_thread-1; ++i){ - local_mem[lid] += local_mem[lid + stride + i]; - } - } - item.barrier(s::access::fence_space::local_space); - } - - // Only one work-item per work group writes to global memory - if (lid == 0) { - global_mem[item.get_global_id()] = local_mem[0]; + // local memory for reduction + auto local_mem = s::local_accessor{s::range<1>(wgroup_size), cgh}; + + sycl::nd_range<1> ndrange(n_wgroups * wgroup_size, wgroup_size); + + if(Use_ndrange) { + cgh.parallel_for>(ndrange, [=](sycl::nd_item<1> item) { + size_t gid = item.get_global_linear_id(); + size_t lid = item.get_local_linear_id(); + + // initialize local memory to 0 + local_mem[lid] = 0; + + for(int i = 0; i < elements_per_thread; ++i) { + int input_element = gid + i * n_wgroups * wgroup_size; + + if(input_element < array_size) + local_mem[lid] += global_mem[input_element]; + } + + sycl::group_barrier(item.get_group()); + + for(size_t stride = wgroup_size / elements_per_thread; stride >= 1; stride /= elements_per_thread) { + if(lid < stride) { + for(int i = 0; i < elements_per_thread - 1; ++i) { + local_mem[lid] += local_mem[lid + stride + i]; } - }); - } - else { - cgh.parallel_for_work_group>( - cl::sycl::range<1>{n_wgroups}, cl::sycl::range<1>{wgroup_size}, - [=](cl::sycl::group<1> grp){ - - grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){ + } + sycl::group_barrier(item.get_group()); + } + + // Only one work-item per work group writes to global memory + if(lid == 0) { + global_mem[item.get_global_id()] = local_mem[0]; + } + }); + } else { + cgh.parallel_for_work_group>( + sycl::range<1>{n_wgroups}, sycl::range<1>{wgroup_size}, [=](sycl::group<1> grp) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { const size_t gid = idx.get_global_id(0); const size_t lid = idx.get_local_id(0); // initialize local memory to 0 - local_mem[lid] = 0; + local_mem[lid] = 0; for(int i = 0; i < elements_per_thread; ++i) { int input_element = gid + i * n_wgroups * wgroup_size; - + if(input_element < array_size) local_mem[lid] += global_mem[input_element]; } }); - for(size_t stride = wgroup_size/elements_per_thread; stride >= 1; stride /= elements_per_thread) { - grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){ - + for(size_t stride = wgroup_size / elements_per_thread; stride >= 1; stride /= elements_per_thread) { + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { const size_t lid = idx.get_local_id(0); - + if(lid < stride) { - for(int i = 0; i < elements_per_thread-1; ++i){ + for(int i = 0; i < elements_per_thread - 1; ++i) { local_mem[lid] += local_mem[lid + stride + i]; } } }); } - grp.parallel_for_work_item([&](cl::sycl::h_item<1> idx){ + grp.parallel_for_work_item([&](sycl::h_item<1> idx) { const size_t lid = idx.get_local_id(0); if(lid == 0) - global_mem[grp.get_id(0) * grp.get_local_range(0)] = local_mem[0]; + global_mem[grp.get_group_id(0) * grp.get_local_range(0)] = local_mem[0]; }); }); - } - })); - - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { - - auto global_mem = output_buf.template get_access(cgh); - - cgh.parallel_for>(cl::sycl::range<1>{n_wgroups}, - [=](cl::sycl::id<1> idx){ - global_mem[idx] = global_mem[idx * wgroup_size]; - }); - })); + } + })); + + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { + auto global_mem = output_buf.template get_access(cgh); + + cgh.parallel_for>( + sycl::range<1>{n_wgroups}, [=](sycl::id<1> idx) { global_mem[idx] = global_mem[idx * wgroup_size]; }); + })); array_size = n_wgroups; } } - bool verify(VerificationSetting &ver) { + bool verify(VerificationSetting& ver) { bool pass = true; - auto expected = static_cast (0); + auto expected = static_cast(0); - auto output_acc = output_buf.template get_access(); + auto output_acc = output_buf.get_host_access(); for(size_t i = 0; i < args.problem_size; i++) { - expected += input1[i] * input2[i]; + expected += input1[i] * input2[i]; } - //std::cout << "Scalar product on CPU =" << expected << std::endl; - //std::cout << "Scalar product on Device =" << output[0] << std::endl; + // std::cout << "Scalar product on CPU =" << expected << std::endl; + // std::cout << "Scalar product on Device =" << output[0] << std::endl; // Todo: update to type-specific test (Template specialization?) const auto tolerance = 0.00001f; @@ -213,7 +196,7 @@ class ScalarProdBench return pass; } - + static std::string getBenchmarkName(BenchmarkArgs& args) { std::stringstream name; name << "ScalarProduct_"; @@ -223,22 +206,22 @@ class ScalarProdBench } }; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); if(app.shouldRunNDRangeKernels()) { app.run>(); app.run>(); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } } app.run>(); app.run>(); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); - + } return 0; } diff --git a/single-kernel/sobel.cpp b/single-kernel/sobel.cpp index 9068668e..f906befc 100644 --- a/single-kernel/sobel.cpp +++ b/single-kernel/sobel.cpp @@ -1,57 +1,57 @@ -#include #include +#include -#include "common.h" #include "bitmap.h" +#include "common.h" -namespace s = cl::sycl; +namespace s = sycl; class SobelBenchKernel; // kernel forward declaration /* A Sobel filter with a convolution matrix 3x3. - Input and output are two-dimensional buffers of floats. + Input and output are two-dimensional buffers of floats. */ -class SobelBench -{ +class SobelBench { protected: - std::vector input; - std::vector output; + std::vector input; + std::vector output; size_t w, h; // size of the input picture size_t size; // user-defined size (input and output will be size x size) BenchmarkArgs args; - PrefetchedBuffer input_buf; - PrefetchedBuffer output_buf; + PrefetchedBuffer input_buf; + PrefetchedBuffer output_buf; + public: - SobelBench(const BenchmarkArgs &_args) : args(_args) {} + SobelBench(const BenchmarkArgs& _args) : args(_args) {} void setup() { size = args.problem_size; // input size defined by the user - input.resize(size * size); + input.resize(size * size); load_bitmap_mirrored("../../share/Brommy.bmp", size, input); output.resize(size * size); - input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size)); + input_buf.initialize(args.device_queue, input.data(), s::range<2>(size, size)); output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.get_access(cgh); auto out = output_buf.get_access(cgh); - cl::sycl::range<2> ndrange{size, size}; + sycl::range<2> ndrange{size, size}; // Sobel kernel 3x3 const float kernel[] = {1, 0, -1, 2, 0, -2, 1, 0, -1}; - cgh.parallel_for(ndrange, [in, out, kernel, size_ = size](cl::sycl::id<2> gid) { + cgh.parallel_for(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) { int x = gid[0]; int y = gid[1]; - cl::sycl::float4 Gx = cl::sycl::float4(0, 0, 0, 0); - cl::sycl::float4 Gy = cl::sycl::float4(0, 0, 0, 0); + sycl::float4 Gx = sycl::float4(0, 0, 0, 0); + sycl::float4 Gy = sycl::float4(0, 0, 0, 0); const int radius = 3; // constant-size loops in [0,1,2] @@ -68,29 +68,29 @@ class SobelBench continue; // sample color - cl::sycl::float4 sample = in[{xs, ys}]; + sycl::float4 sample = in[{xs, ys}]; // convolution calculation int offset_x = x_shift + y_shift * radius; int offset_y = y_shift + x_shift * radius; float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); + sycl::float4 conv4_x = sycl::float4(conv_x); Gx += conv4_x * sample; float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); + sycl::float4 conv4_y = sycl::float4(conv_y); Gy += conv4_y * sample; } } // taking root of sums of squares of Gx and Gy - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); out[gid] = clamp(color, minval, maxval); }); })); - } + } bool verify(VerificationSetting& ver) { @@ -104,7 +104,7 @@ class SobelBench for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) { int x = i % size; int y = i / size; - cl::sycl::float4 Gx, Gy; + sycl::float4 Gx, Gy; for(uint x_shift = 0; x_shift < 3; x_shift++) for(uint y_shift = 0; y_shift < 3; y_shift++) { uint xs = x + x_shift - 1; @@ -113,23 +113,23 @@ class SobelBench continue; if(xs < 0 || xs >= size || ys < 0 || ys >= size) continue; - cl::sycl::float4 sample = input[xs + ys * size]; + sycl::float4 sample = input[xs + ys * size]; int offset_x = x_shift + y_shift * radius; int offset_y = y_shift + x_shift * radius; float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); + sycl::float4 conv4_x = sycl::float4(conv_x); Gx += conv4_x * sample; float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); + sycl::float4 conv4_y = sycl::float4(conv_y); Gy += conv4_y * sample; } - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); - cl::sycl::float4 expected = clamp(color, minval, maxval); - cl::sycl::float4 dif = fdim(output[i], expected); - float length = cl::sycl::length(dif); + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); + sycl::float4 expected = clamp(color, minval, maxval); + sycl::float4 dif = fdim(output[i], expected); + float length = sycl::length(dif); if(length > 0.01f) { pass = false; break; @@ -144,11 +144,8 @@ class SobelBench }; // SobelBench class -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); - app.run(); + app.run(); return 0; } - - diff --git a/single-kernel/sobel5.cpp b/single-kernel/sobel5.cpp index 92ede750..47316328 100644 --- a/single-kernel/sobel5.cpp +++ b/single-kernel/sobel5.cpp @@ -1,11 +1,11 @@ -#include #include +#include -#include "common.h" #include "bitmap.h" +#include "common.h" -namespace s = cl::sycl; +namespace s = sycl; class Sobel5BenchKernel; // kernel forward declaration @@ -13,22 +13,22 @@ class Sobel5BenchKernel; // kernel forward declaration A Sobel filter with a convolution matrix 5x5. The convolution kernel is calculated by using a recursive conv2 on the [1 2 1]'*[1 0 -1] basis matrix. - Input and output are two-dimensional buffers of floats. + Input and output are two-dimensional buffers of floats. */ -class Sobel5Bench -{ +class Sobel5Bench { protected: - std::vector input; - std::vector output; + std::vector input; + std::vector output; + + size_t w, h; // size of the input picture + size_t size; // user-defined size (input and output will be size x size) + BenchmarkArgs args; - size_t w, h; // size of the input picture - size_t size; // user-defined size (input and output will be size x size) - BenchmarkArgs args; + PrefetchedBuffer input_buf; + PrefetchedBuffer output_buf; - PrefetchedBuffer input_buf; - PrefetchedBuffer output_buf; public: - Sobel5Bench(const BenchmarkArgs &_args) : args(_args) {} + Sobel5Bench(const BenchmarkArgs& _args) : args(_args) {} void setup() { size = args.problem_size; // input size defined by the user @@ -40,132 +40,113 @@ class Sobel5Bench output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { - auto in = input_buf .get_access(cgh); + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { + auto in = input_buf.get_access(cgh); auto out = output_buf.get_access(cgh); - cl::sycl::range<2> ndrange {size, size}; + sycl::range<2> ndrange{size, size}; // Sobel kernel 5x5 - const float kernel[] = - { 1, 2, 0, -2, -1, - 4, 8, 0, -8, -4, - 6, 12, 0, -12, -6, - 4, 8, 0, -8, -4, - 1, 2, 0, -2, -1 - }; - - cgh.parallel_for(ndrange, - [in, out, kernel, size_ = size](cl::sycl::id<2> gid) - { - int x = gid[0]; - int y = gid[1]; - cl::sycl::float4 Gx = cl::sycl::float4(0,0,0,0); - cl::sycl::float4 Gy = cl::sycl::float4(0,0,0,0); - const int radius = 5; - - // constant-size loops in [0,1,2,3,4] - for(int x_shift = 0; x_shift<5; x_shift++) - { - for(int y_shift = 0; y_shift<5; y_shift++) - { - // sample position - uint xs = x + x_shift - 2; // [x-2,x-1,x,x+1,x+2] - uint ys = y + y_shift - 2; // [y-2,y-1,y,y+1,y+2] - // for the same pixel, convolution is always 0 - if(x==xs && y==ys) continue; - // boundary check - if(xs < 0 || xs >= size_ || ys < 0 || ys >= size_) continue; - - // sample color - cl::sycl::float4 sample = in[ {xs,ys} ]; - - // convolution calculation - int offset_x = x_shift + y_shift * radius; - int offset_y = y_shift + x_shift * radius; - - float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); - Gx += conv4_x * sample; - - float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); - Gy += conv4_y * sample; - } + const float kernel[] = {1, 2, 0, -2, -1, 4, 8, 0, -8, -4, 6, 12, 0, -12, -6, 4, 8, 0, -8, -4, 1, 2, 0, -2, -1}; + + cgh.parallel_for(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) { + int x = gid[0]; + int y = gid[1]; + sycl::float4 Gx = sycl::float4(0, 0, 0, 0); + sycl::float4 Gy = sycl::float4(0, 0, 0, 0); + const int radius = 5; + + // constant-size loops in [0,1,2,3,4] + for(int x_shift = 0; x_shift < 5; x_shift++) { + for(int y_shift = 0; y_shift < 5; y_shift++) { + // sample position + uint xs = x + x_shift - 2; // [x-2,x-1,x,x+1,x+2] + uint ys = y + y_shift - 2; // [y-2,y-1,y,y+1,y+2] + // for the same pixel, convolution is always 0 + if(x == xs && y == ys) + continue; + // boundary check + if(xs < 0 || xs >= size_ || ys < 0 || ys >= size_) + continue; + + // sample color + sycl::float4 sample = in[{xs, ys}]; + + // convolution calculation + int offset_x = x_shift + y_shift * radius; + int offset_y = y_shift + x_shift * radius; + + float conv_x = kernel[offset_x]; + sycl::float4 conv4_x = sycl::float4(conv_x); + Gx += conv4_x * sample; + + float conv_y = kernel[offset_y]; + sycl::float4 conv4_y = sycl::float4(conv_y); + Gy += conv4_y * sample; } - // taking root of sums of squares of Gx and Gy - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); - out[gid] = clamp(color, minval, maxval); - } - ); - })); - } - - bool verify(VerificationSetting &ver) { + } + // taking root of sums of squares of Gx and Gy + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); + out[gid] = clamp(color, minval, maxval); + }); + })); + } + + bool verify(VerificationSetting& ver) { // Triggers writeback output_buf.reset(); save_bitmap("sobel5.bmp", size, output); - const float kernel[] = { 1, 2, 0, -2, -1,4, 8, 0, -8, -4, 6, 12, 0, -12, -6, 4, 8, 0, -8, -4, 1, 2, 0, -2, -1 }; + const float kernel[] = {1, 2, 0, -2, -1, 4, 8, 0, -8, -4, 6, 12, 0, -12, -6, 4, 8, 0, -8, -4, 1, 2, 0, -2, -1}; bool pass = true; int radius = 5; - for(size_t i=ver.begin[0]; i= size || ys < 0 || ys >= size) continue; - cl::sycl::float4 sample = input[xs + ys * size]; - int offset_x = x_shift + y_shift * radius; - int offset_y = y_shift + x_shift * radius; - float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); - Gx += conv4_x * sample; - float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); - Gy += conv4_y * sample; - } - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); - cl::sycl::float4 expected = clamp(color, minval, maxval); - cl::sycl::float4 dif = fdim(output[i], expected); - float length = cl::sycl::length(dif); - if(length > 0.01f) - { - pass = false; - break; + sycl::float4 Gx, Gy; + for(uint x_shift = 0; x_shift < 5; x_shift++) + for(uint y_shift = 0; y_shift < 5; y_shift++) { + uint xs = x + x_shift - 2; + uint ys = y + y_shift - 2; + if(x == xs && y == ys) + continue; + if(xs < 0 || xs >= size || ys < 0 || ys >= size) + continue; + sycl::float4 sample = input[xs + ys * size]; + int offset_x = x_shift + y_shift * radius; + int offset_y = y_shift + x_shift * radius; + float conv_x = kernel[offset_x]; + sycl::float4 conv4_x = sycl::float4(conv_x); + Gx += conv4_x * sample; + float conv_y = kernel[offset_y]; + sycl::float4 conv4_y = sycl::float4(conv_y); + Gy += conv4_y * sample; } + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); + sycl::float4 expected = clamp(color, minval, maxval); + sycl::float4 dif = fdim(output[i], expected); + float length = sycl::length(dif); + if(length > 0.01f) { + pass = false; + break; + } } return pass; -} - - -static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Sobel5"; } -}; // SobelBench class + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Sobel5"; } +}; // SobelBench class -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run(); return 0; } - - - - - diff --git a/single-kernel/sobel7.cpp b/single-kernel/sobel7.cpp index b75d25da..5a2d7cf0 100644 --- a/single-kernel/sobel7.cpp +++ b/single-kernel/sobel7.cpp @@ -1,32 +1,32 @@ -#include #include +#include -#include "common.h" #include "bitmap.h" +#include "common.h" -namespace s = cl::sycl; +namespace s = sycl; class Sobel7BenchKernel; // kernel forward declaration /* A Sobel filter with a convolution matrix 7x7. - Input and output are two-dimensional buffers of floats. + Input and output are two-dimensional buffers of floats. */ -class Sobel7Bench -{ +class Sobel7Bench { protected: - std::vector input; - std::vector output; + std::vector input; + std::vector output; + + size_t w, h; // size of the input picture + size_t size; // user-defined size (input and output will be size x size) + BenchmarkArgs args; - size_t w, h; // size of the input picture - size_t size; // user-defined size (input and output will be size x size) - BenchmarkArgs args; + PrefetchedBuffer input_buf; + PrefetchedBuffer output_buf; - PrefetchedBuffer input_buf; - PrefetchedBuffer output_buf; public: - Sobel7Bench(const BenchmarkArgs &_args) : args(_args) {} + Sobel7Bench(const BenchmarkArgs& _args) : args(_args) {} void setup() { size = args.problem_size; // input size defined by the user @@ -38,22 +38,22 @@ class Sobel7Bench output_buf.initialize(args.device_queue, output.data(), s::range<2>(size, size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit([&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in = input_buf.get_access(cgh); auto out = output_buf.get_access(cgh); - cl::sycl::range<2> ndrange{size, size}; + sycl::range<2> ndrange{size, size}; // Sobel kernel 7x7 const float kernel[] = {130, 120, 78, 0, -78, -120, -130, 180, 195, 156, 0, -156, -195, -180, 234, 312, 390, 0, -390, -312, -234, 260, 390, 780, 0, -780, -390, -260, 234, 312, 390, 0, -390, -312, -234, 180, 195, 156, 0, -156, -195, -180, 130, 120, 78, 0, -78, -120, -130}; - cgh.parallel_for(ndrange, [in, out, kernel, size_ = size](cl::sycl::id<2> gid) { + cgh.parallel_for(ndrange, [in, out, kernel, size_ = size](sycl::id<2> gid) { int x = gid[0]; int y = gid[1]; - cl::sycl::float4 Gx = cl::sycl::float4(0, 0, 0, 0); - cl::sycl::float4 Gy = cl::sycl::float4(0, 0, 0, 0); + sycl::float4 Gx = sycl::float4(0, 0, 0, 0); + sycl::float4 Gy = sycl::float4(0, 0, 0, 0); const int radius = 7; // constant-size loops in [0,1,2,3,4,5,6] @@ -70,25 +70,25 @@ class Sobel7Bench continue; // sample color - cl::sycl::float4 sample = in[{xs, ys}]; + sycl::float4 sample = in[{xs, ys}]; // convolution calculation int offset_x = x_shift + y_shift * radius; int offset_y = y_shift + x_shift * radius; float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); + sycl::float4 conv4_x = sycl::float4(conv_x); Gx += conv4_x * sample; float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); + sycl::float4 conv4_y = sycl::float4(conv_y); Gy += conv4_y * sample; } } // taking root of sums of squares of Gx and Gy - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); out[gid] = clamp(color, minval, maxval); }); })); @@ -99,22 +99,16 @@ class Sobel7Bench output_buf.reset(); save_bitmap("sobel7.bmp", size, output); - const float kernel[] = { - 130, 120, 78, 0, -78, -120, -130, - 180, 195, 156, 0, -156, -195, -180, - 234, 312, 390, 0, -390, -312, -234, - 260, 390, 780, 0, -780, -390, -260, - 234, 312, 390, 0, -390, -312, -234, - 180, 195, 156, 0, -156, -195, -180, - 130, 120, 78, 0, -78, -120, -130 - }; + const float kernel[] = {130, 120, 78, 0, -78, -120, -130, 180, 195, 156, 0, -156, -195, -180, 234, 312, 390, 0, + -390, -312, -234, 260, 390, 780, 0, -780, -390, -260, 234, 312, 390, 0, -390, -312, -234, 180, 195, 156, 0, + -156, -195, -180, 130, 120, 78, 0, -78, -120, -130}; bool pass = true; int radius = 7; for(size_t i = ver.begin[0]; i < ver.begin[0] + ver.range[0]; i++) { int x = i % size; int y = i / size; - cl::sycl::float4 Gx, Gy; + sycl::float4 Gx, Gy; for(uint x_shift = 0; x_shift < 7; x_shift++) for(uint y_shift = 0; y_shift < 7; y_shift++) { uint xs = x + x_shift - 3; @@ -123,22 +117,22 @@ class Sobel7Bench continue; if(xs < 0 || xs >= size || ys < 0 || ys >= size) continue; - cl::sycl::float4 sample = input[xs + ys * size]; + sycl::float4 sample = input[xs + ys * size]; int offset_x = x_shift + y_shift * radius; int offset_y = y_shift + x_shift * radius; float conv_x = kernel[offset_x]; - cl::sycl::float4 conv4_x = cl::sycl::float4(conv_x); + sycl::float4 conv4_x = sycl::float4(conv_x); Gx += conv4_x * sample; float conv_y = kernel[offset_y]; - cl::sycl::float4 conv4_y = cl::sycl::float4(conv_y); + sycl::float4 conv4_y = sycl::float4(conv_y); Gy += conv4_y * sample; } - cl::sycl::float4 color = hypot(Gx, Gy); - cl::sycl::float4 minval = cl::sycl::float4(0.0, 0.0, 0.0, 0.0); - cl::sycl::float4 maxval = cl::sycl::float4(1.0, 1.0, 1.0, 1.0); - cl::sycl::float4 expected = clamp(color, minval, maxval); - cl::sycl::float4 dif = fdim(output[i], expected); - float length = cl::sycl::length(dif); + sycl::float4 color = hypot(Gx, Gy); + sycl::float4 minval = sycl::float4(0.0, 0.0, 0.0, 0.0); + sycl::float4 maxval = sycl::float4(1.0, 1.0, 1.0, 1.0); + sycl::float4 expected = clamp(color, minval, maxval); + sycl::float4 dif = fdim(output[i], expected); + float length = sycl::length(dif); if(length > 0.01f) { pass = false; break; @@ -148,24 +142,13 @@ class Sobel7Bench } -static std::string getBenchmarkName(BenchmarkArgs& args) { - return "Sobel7"; - } + static std::string getBenchmarkName(BenchmarkArgs& args) { return "Sobel7"; } }; // SobelBench class - - -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { BenchmarkApp app(argc, argv); app.run(); return 0; } - - - - - - diff --git a/single-kernel/vec_add.cpp b/single-kernel/vec_add.cpp index 7d768814..0a1a7231 100644 --- a/single-kernel/vec_add.cpp +++ b/single-kernel/vec_add.cpp @@ -2,17 +2,17 @@ #include -// Opening cl::sycl namespace is unsupported on hipSYCL -// (mainly due to CUDA/HIP design issues), better +// Opening sycl namespace is unsupported on hipSYCL +// (mainly due to CUDA/HIP design issues), better // avoid it -//using namespace cl::sycl; -namespace s = cl::sycl; -template class VecAddKernel; +// using namespace sycl; +namespace s = sycl; +template +class VecAddKernel; template -class VecAddBench -{ -protected: +class VecAddBench { +protected: std::vector input1; std::vector input2; std::vector output; @@ -23,15 +23,15 @@ class VecAddBench PrefetchedBuffer output_buf; public: - VecAddBench(const BenchmarkArgs &_args) : args(_args) {} - + VecAddBench(const BenchmarkArgs& _args) : args(_args) {} + void setup() { // host memory intilization input1.resize(args.problem_size); input2.resize(args.problem_size); output.resize(args.problem_size); - for (size_t i =0; i < args.problem_size; i++) { + for(size_t i = 0; i < args.problem_size; i++) { input1[i] = static_cast(i); input2[i] = static_cast(i); output[i] = static_cast(0); @@ -42,39 +42,33 @@ class VecAddBench output_buf.initialize(args.device_queue, output.data(), s::range<1>(args.problem_size)); } - void run(std::vector& events) { - events.push_back(args.device_queue.submit( - [&](cl::sycl::handler& cgh) { + void run(std::vector& events) { + events.push_back(args.device_queue.submit([&](sycl::handler& cgh) { auto in1 = input1_buf.template get_access(cgh); auto in2 = input2_buf.template get_access(cgh); // Use discard_write here, otherwise the content of the host buffer must first be copied to device auto out = output_buf.template get_access(cgh); - cl::sycl::range<1> ndrange {args.problem_size}; + sycl::range<1> ndrange{args.problem_size}; - cgh.parallel_for>(ndrange, - [=](cl::sycl::id<1> gid) - { - out[gid] = in1[gid] + in2[gid]; - }); + cgh.parallel_for>(ndrange, [=](sycl::id<1> gid) { out[gid] = in1[gid] + in2[gid]; }); })); - } - bool verify(VerificationSetting &ver) { - //Triggers writeback + bool verify(VerificationSetting& ver) { + // Triggers writeback output_buf.reset(); bool pass = true; - for(size_t i=ver.begin[0]; i>(); - app.run>(); + app.run>(); app.run>(); - if(app.deviceSupportsFP64()) + if constexpr(SYCL_BENCH_HAS_FP64_SUPPORT) { app.run>(); + } return 0; } diff --git a/sycl2020/USM/usm_accessors_latency.cpp b/sycl2020/USM/usm_accessors_latency.cpp new file mode 100644 index 00000000..49d2c922 --- /dev/null +++ b/sycl2020/USM/usm_accessors_latency.cpp @@ -0,0 +1,171 @@ +#include "common.h" +#include "memory_wrappers.h" + +namespace s = sycl; + +static constexpr std::size_t kernels_launch_default = 5000; + +template +class accessor_latency_kernel; +template +class usm_latency_kernel; + + +/** +Measure Accessors latency compared to USM +The benchmark submits multiple small kernels which stress SYCL dependency tracking. + */ +template +class LatencyBenchmark { +protected: + BenchmarkArgs args; + size_t kernel_launches_num; + + LatencyBenchmark(const BenchmarkArgs& args, const size_t kernel_launches_num) + : args(args), kernel_launches_num(kernel_launches_num) {} + + s::range<1> getRange() const { return s::range<1>{args.problem_size}; } + + s::nd_range<1> getNDRange() const { + return s::nd_range<1>{args.problem_size, args.problem_size > 1024 ? 1024 : args.problem_size}; + } + + sycl::queue& getQueue() { + if constexpr(in_order) { + return args.device_queue_in_order; + } else { + return args.device_queue; + } + } +}; + +template +class AccessorLatency : LatencyBenchmark { +protected: + PrefetchedBuffer buff_A; + PrefetchedBuffer buff_B; + PrefetchedBuffer buff_C; + +public: + using base = LatencyBenchmark; + using base::args; + using base::base; + using base::getNDRange; + using base::getQueue; + using base::getRange; + using base::kernel_launches_num; + + AccessorLatency(const BenchmarkArgs& args, const size_t kernel_launches_num) : base(args, kernel_launches_num) {} + + void setup() { + const auto range = getRange(); + buff_A.initialize(args.device_queue, range); + buff_B.initialize(args.device_queue, range); + buff_C.initialize(args.device_queue, range); + } + + void run(std::vector& events) { + auto& queue = getQueue(); + for(int i = 0; i < kernel_launches_num; i++) { + auto event = queue.submit([&](s::handler& cgh) { + auto acc_A = buff_A.template get_access(cgh, buff_A.get_range()); + auto acc_B = buff_B.template get_access(cgh, buff_B.get_range()); + auto acc_C = buff_C.template get_access(cgh, buff_C.get_range()); + + cgh.parallel_for>( + getNDRange(), [=](s::nd_item<1> item) { + const auto id = item.get_global_linear_id(); + acc_C[id] = acc_A[id] + acc_B[id]; + }); + }); + if constexpr(synch) { + queue.wait(); + } + events.push_back(event); + } + } + + + static std::string getBenchmarkName(BenchmarkArgs& args) { + std::stringstream name; + name << "SYCL2020_Accessors_Latency_"; + name << ReadableTypename::name << "_"; + name << (in_order ? "in_order" : "out_of_order") << "_"; + name << (synch ? "synch" : "") << "_"; + return name.str(); + } +}; + +template +class USMLatency : LatencyBenchmark { +protected: + USMBuffer buff_A; + USMBuffer buff_B; + USMBuffer buff_C; + + using base = LatencyBenchmark; + using base::args; + using base::base; + using base::getNDRange; + using base::getQueue; + using base::getRange; + using base::kernel_launches_num; + +public: + USMLatency(const BenchmarkArgs& args, const size_t kernel_launches_num) : base(args, kernel_launches_num) {} + + void setup() { + buff_A.initialize(getQueue(), getRange()); + buff_B.initialize(getQueue(), getRange()); + buff_C.initialize(getQueue(), getRange()); + } + + void run(std::vector& events) { + auto& queue = getQueue(); + sycl::event event; + auto* acc_A = buff_A.get(); + auto* acc_B = buff_B.get(); + auto* acc_C = buff_C.get(); + for(int i = 0; i < kernel_launches_num; i++) { + event = queue.submit([&](s::handler& cgh) { + // Disable kernel dependencies build when queue is in_order + if constexpr(!in_order && !synch) { + cgh.depends_on(event); + } + cgh.parallel_for>(getNDRange(), [=](s::nd_item<1> item) { + const auto id = item.get_global_linear_id(); + acc_C[id] = acc_A[id] + acc_B[id]; + }); + }); + if constexpr(synch) { + queue.wait(); + } + // Add kernel event to kernel's list + events.push_back(event); + } + } + + + static std::string getBenchmarkName(BenchmarkArgs& args) { + std::stringstream name; + name << "USM_Latency_"; + name << ReadableTypename::name << "_"; + name << (in_order ? "in_order" : "out_of_order") << "_"; + name << (synch ? "synch" : "") << "_"; + return name.str(); + } +}; + +template