From bd9f2fb8f976cce77a2bdbd4375bd48f762fc5a5 Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Fri, 16 Aug 2024 17:03:03 -0500 Subject: [PATCH 1/3] Various tweaks to enable Wave32 for Radeon cards with HIP. --- CMakeLists.txt | 1 + include/RAJA/config.hpp.in | 2 ++ include/RAJA/pattern/kernel/InitLocalMem.hpp | 9 +-------- include/RAJA/policy/hip/policy.hpp | 5 +++-- include/RAJA/policy/tensor/arch/hip/hip_wave.hpp | 9 +++++---- include/RAJA/policy/tensor/arch/hip/traits.hpp | 3 ++- test/include/RAJA_test-tensor.hpp | 4 +++- 7 files changed, 17 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b31cbe124..3eb0dbc8d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,7 @@ endif() if(RAJA_ENABLE_HIP) message(STATUS "HIP version: ${hip_VERSION}") + set(RAJA_HIP_WAVESIZE "64" CACHE STRING "Set the wave size for GPU architecture. E.g. MI200/MI300 this is 64.") if("${hip_VERSION}" VERSION_LESS "3.5") message(FATAL_ERROR "Trying to use HIP/ROCm version ${hip_VERSION}. RAJA requires HIP/ROCm version 3.5 or newer. ") endif() diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 380418efa1..29d97fed69 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -182,6 +182,8 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX +#cmakedefine RAJA_HIP_WAVESIZE @RAJA_HIP_WAVESIZE@ + /*! ****************************************************************************** * diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp index 21d9e3cd2a..e4d5cc9567 100644 --- a/include/RAJA/pattern/kernel/InitLocalMem.hpp +++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp @@ -77,23 +77,16 @@ struct StatementExecutor::param_tuple_t>::value_type; // Initialize memory -#ifdef RAJA_COMPILER_MSVC - // MSVC doesn't like taking a pointer to stack allocated data?!?! varType *ptr = new varType[camp::get(data.param_tuple).size()]; camp::get(data.param_tuple).set_data(ptr); -#else - varType Array[camp::get(data.param_tuple).size()]; - camp::get(data.param_tuple).set_data(&Array[0]); -#endif + // Initialize others and execute exec_expanded(data); // Cleanup and return camp::get(data.param_tuple).set_data(nullptr); -#ifdef RAJA_COMPILER_MSVC delete[] ptr; -#endif } diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index a9f9027675..040de50f31 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -324,8 +324,9 @@ struct DeviceConstants // values for HIP warp size and max block size. // #if defined(__HIP_PLATFORM_AMD__) -constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A -// constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X +constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 64); // MI300A +// constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 128); // MI250X + #elif defined(__HIP_PLATFORM_NVIDIA__) constexpr DeviceConstants device_constants(32, 1024, 32); // V100 #endif diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp index 74bbc2f077..3e1cff1d56 100644 --- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp +++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp @@ -57,7 +57,7 @@ namespace expt public: - static constexpr int s_num_elem = 64; + static constexpr int s_num_elem = policy::hip::device_constants.WARP_SIZE; /*! * @brief Default constructor, zeros register contents @@ -780,8 +780,8 @@ namespace expt // Third: mask off everything but output_segment // this is because all output segments are valid at this point - // (5-segbits), the 5 is since the warp-width is 32 == 1<<5 - int our_output_segment = get_lane()>>(6-segbits); + constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); + int our_output_segment = get_lane()>>(log2_warp_size-segbits); bool in_output_segment = our_output_segment == output_segment; if(!in_output_segment){ result.get_raw_value() = 0; @@ -828,8 +828,9 @@ namespace expt // First: tree reduce values within each segment element_type x = m_value; + constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); RAJA_UNROLL - for(int i = 0;i < 6-segbits; ++ i){ + for(int i = 0;i < log2_warp_size-segbits; ++ i){ // tree shuffle int delta = s_num_elem >> (i+1); diff --git a/include/RAJA/policy/tensor/arch/hip/traits.hpp b/include/RAJA/policy/tensor/arch/hip/traits.hpp index 4c4d959599..1b8a9679bb 100644 --- a/include/RAJA/policy/tensor/arch/hip/traits.hpp +++ b/include/RAJA/policy/tensor/arch/hip/traits.hpp @@ -29,7 +29,8 @@ namespace expt { struct RegisterTraits{ using element_type = T; using register_policy = RAJA::expt::hip_wave_register; - static constexpr camp::idx_t s_num_elem = 64; + static constexpr camp::idx_t s_num_elem = policy::hip::device_constants.WARP_SIZE; + static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem; using int_element_type = int32_t; }; diff --git a/test/include/RAJA_test-tensor.hpp b/test/include/RAJA_test-tensor.hpp index cf633098a9..83ef4fe49f 100644 --- a/test/include/RAJA_test-tensor.hpp +++ b/test/include/RAJA_test-tensor.hpp @@ -87,7 +87,9 @@ struct TensorTestHelper void exec(BODY const &body){ hipDeviceSynchronize(); - RAJA::forall>(RAJA::RangeSegment(0,64), + constexpr int warp_size = RAJA::policy::hip::device_constants.WARP_SIZE; + + RAJA::forall>(RAJA::RangeSegment(0,warp_size), [=] RAJA_HOST_DEVICE (int ){ body(); }); From f352a5f84bd28bcb20b59b6398b21d837bac9b1c Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Mon, 18 Nov 2024 11:33:30 -0600 Subject: [PATCH 2/3] Removing dynamic stack allocation fix --- include/RAJA/pattern/kernel/InitLocalMem.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp index e4d5cc9567..21d9e3cd2a 100644 --- a/include/RAJA/pattern/kernel/InitLocalMem.hpp +++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp @@ -77,16 +77,23 @@ struct StatementExecutor::param_tuple_t>::value_type; // Initialize memory +#ifdef RAJA_COMPILER_MSVC + // MSVC doesn't like taking a pointer to stack allocated data?!?! varType *ptr = new varType[camp::get(data.param_tuple).size()]; camp::get(data.param_tuple).set_data(ptr); - +#else + varType Array[camp::get(data.param_tuple).size()]; + camp::get(data.param_tuple).set_data(&Array[0]); +#endif // Initialize others and execute exec_expanded(data); // Cleanup and return camp::get(data.param_tuple).set_data(nullptr); +#ifdef RAJA_COMPILER_MSVC delete[] ptr; +#endif } From d0a65e74e7b4b0e0d3d308de011d8d2faefa4be7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Nov 2024 11:29:29 -0800 Subject: [PATCH 3/3] Apply suggestions from code review --- include/RAJA/policy/tensor/arch/hip/hip_wave.hpp | 4 ++-- test/include/RAJA_test-tensor.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp index 3e1cff1d56..f1810807f9 100644 --- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp +++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp @@ -780,7 +780,7 @@ namespace expt // Third: mask off everything but output_segment // this is because all output segments are valid at this point - constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); + static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); int our_output_segment = get_lane()>>(log2_warp_size-segbits); bool in_output_segment = our_output_segment == output_segment; if(!in_output_segment){ @@ -828,7 +828,7 @@ namespace expt // First: tree reduce values within each segment element_type x = m_value; - constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); + static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); RAJA_UNROLL for(int i = 0;i < log2_warp_size-segbits; ++ i){ diff --git a/test/include/RAJA_test-tensor.hpp b/test/include/RAJA_test-tensor.hpp index 83ef4fe49f..d836e1463f 100644 --- a/test/include/RAJA_test-tensor.hpp +++ b/test/include/RAJA_test-tensor.hpp @@ -87,7 +87,7 @@ struct TensorTestHelper void exec(BODY const &body){ hipDeviceSynchronize(); - constexpr int warp_size = RAJA::policy::hip::device_constants.WARP_SIZE; + static constexpr int warp_size = RAJA::policy::hip::device_constants.WARP_SIZE; RAJA::forall>(RAJA::RangeSegment(0,warp_size), [=] RAJA_HOST_DEVICE (int ){