From ab31cceab88898522e04ea6af43bb0f6695b6bf6 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 25 Jul 2023 23:54:16 +0800 Subject: [PATCH 1/5] [FIX SW 396203] check launch kernel grid size not beyond 32bit integer (#2263) * check 32bit launch size --- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 62 +++++++++++++---- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 62 +++++++++++++---- .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 66 ++++++++++++++----- 3 files changed, 148 insertions(+), 42 deletions(-) diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 4f77cdacab..d2b288ebf9 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -382,12 +382,28 @@ GetBwdXdlopsNHWCConfigList() return kernel_param_list; } -static std::tuple // splits_4G +// clang-format off +static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC +GetBwdXdlopsNHWCConfigLargestTileFp32() +{ + return {"bwd", "nhwc", miopenFloat, 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC +GetBwdXdlopsNHWCConfigLargestTileFp16() +{ + return {"bwd", "nhwc", miopenHalf, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC +GetBwdXdlopsNHWCConfigLargestTileBf16() +{ + return {"bwd", "nhwc", miopenBFloat16, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}; +} +// clang-format on + +static std::tuple // splits_4G GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel( - const ConvolutionContext& ctx, const ProblemDescription& problem, const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) { @@ -441,12 +457,11 @@ GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel( integer_divide_ceil(gemm_n, config.gemm_n_per_block) * (1 << config.gemm_k_global_split); if(config.multihead != 0) grid_size *= num_of_gemm; - std::string kernel_name = config.ToKernelName(ctx); - return std::make_tuple(kernel_name, block_size, grid_size, splits_4G); + return std::make_tuple(block_size, grid_size, splits_4G); } void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit( - const ConvolutionContext& ctx, const ProblemDescription& problem) + const ConvolutionContext&, const ProblemDescription& problem) { static const std::vector> tile_list_fp32 = { std::make_tuple(128, 128, 16), @@ -693,8 +708,8 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit( } } size_t current_grid_size; - std::tie(std::ignore, std::ignore, current_grid_size, std::ignore) = - GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, problem, config); + std::tie(std::ignore, current_grid_size, std::ignore) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, config); size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size, 1200, k / group, @@ -809,6 +824,13 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid( if(problem.IsFp16() && gemm_k_global_split != 0 && vector_store != 1 && splits_4G > 1) return false; + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, *this); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); @@ -934,7 +956,18 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable( problem.n_outputs, miopen::GetTypeSize(problem.in_data_type))) return false; - + { + auto largest_config = problem.IsFp32() + ? GetBwdXdlopsNHWCConfigLargestTileFp32() + : (problem.IsFp16() ? GetBwdXdlopsNHWCConfigLargestTileFp16() + : GetBwdXdlopsNHWCConfigLargestTileBf16()); + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, largest_config); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + } return true; } @@ -1000,14 +1033,15 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution( ConvSolution result; KernelInfo kernel; - std::string kernel_name; size_t block_size; size_t grid_size; int splits_4G; - std::tie(kernel_name, block_size, grid_size, splits_4G) = - GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, problem, config); + std::tie(block_size, grid_size, splits_4G) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, config); + + std::string kernel_name = config.ToKernelName(ctx); const auto required_workspace_size = GetWorkspaceSize(ctx, problem); result.workspace_sz = required_workspace_size; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index c53e40039d..8b00d894ff 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -312,12 +312,28 @@ GetFwdXdlopsNHWCConfigList() return kernel_param_list; } -static std::tuple // splits_4G +// clang-format off +static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC +GetFwdXdlopsNHWCConfigLargestTileFp32() +{ + return {"fwd", "nhwc", miopenFloat, 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC +GetFwdXdlopsNHWCConfigLargestTileFp16() +{ + return {"fwd", "nhwc", miopenHalf, 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC +GetFwdXdlopsNHWCConfigLargestTileBf16() +{ + return {"fwd", "nhwc", miopenBFloat16, 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}; +} +// clang-format on + +static std::tuple // splits_4G GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel( - const ConvolutionContext& ctx, const ProblemDescription& problem, const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) { @@ -340,12 +356,11 @@ GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel( size_t grid_size = static_cast(group) * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * integer_divide_ceil(gemm_n, config.gemm_n_per_block) * (1 << config.gemm_k_global_split); - std::string kernel_name = config.ToKernelName(ctx); - return std::make_tuple(kernel_name, block_size, grid_size, splits_4G); + return std::make_tuple(block_size, grid_size, splits_4G); } void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit( - const ConvolutionContext& ctx, const ProblemDescription& problem) + const ConvolutionContext&, const ProblemDescription& problem) { static const std::vector> tile_list_fp32 = { std::make_tuple(128, 128, 16), @@ -567,8 +582,8 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit( } } size_t current_grid_size; - std::tie(std::ignore, std::ignore, current_grid_size, std::ignore) = - GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, problem, config); + std::tie(std::ignore, current_grid_size, std::ignore) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config); size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size, 1200, c / group, @@ -683,6 +698,13 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid( if(problem.IsFp16() && gemm_k_global_split != 0 && vector_store != 1 && splits_4G > 1) return false; + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, *this); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); @@ -873,6 +895,19 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable( miopen::GetTypeSize(problem.in_data_type))) return false; + { + auto largest_config = problem.IsFp32() + ? GetFwdXdlopsNHWCConfigLargestTileFp32() + : (problem.IsFp16() ? GetFwdXdlopsNHWCConfigLargestTileFp16() + : GetFwdXdlopsNHWCConfigLargestTileBf16()); + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, largest_config); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + } + return true; } ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution( @@ -883,14 +918,15 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution( ConvSolution result; KernelInfo kernel; - std::string kernel_name; size_t block_size; size_t grid_size; int splits_4G; - std::tie(kernel_name, block_size, grid_size, splits_4G) = - GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, problem, config); + std::tie(block_size, grid_size, splits_4G) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config); + + std::string kernel_name = config.ToKernelName(ctx); const auto required_workspace_size = GetWorkspaceSize(ctx, problem); result.workspace_sz = required_workspace_size; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp index 6973a4ecba..65f6cb6fcc 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp @@ -309,12 +309,28 @@ GetWrwXdlopsNHWCConfigList() return kernel_param_list; } -static std::tuple // occupancy +// clang-format off +static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC +GetWrwXdlopsNHWCConfigLargestTileFp32() +{ + return {"wrw", "nhwc", miopenFloat, 0, 0, 256, 128, 16, 32, 32, 2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, { 1, 16, 1, 16}, { 1, 1, 1, 8}, { 1, 16, 1, 16}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC +GetWrwXdlopsNHWCConfigLargestTileFp16() +{ + return {"wrw", "nhwc", miopenHalf, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, { 1, 8, 1, 32}, { 1, 4, 1, 8}, { 1, 8, 1, 32}}; +} +static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC +GetWrwXdlopsNHWCConfigLargestTileBf16() +{ + return {"wrw", "nhwc", miopenBFloat16, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, { 1, 8, 1, 32}, { 1, 4, 1, 8}, { 1, 8, 1, 32}}; +} +// clang-format on + +static std::tuple // occupancy GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel( - const ConvolutionContext& ctx, const ProblemDescription& problem, const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) { @@ -338,9 +354,8 @@ GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel( size_t grid_size = static_cast(group) * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * integer_divide_ceil(gemm_n, config.gemm_n_per_block); - std::string kernel_name = config.ToKernelName(ctx); - size_t occupancy = config.ComputeKernelOccupancy(); - return std::make_tuple(kernel_name, block_size, grid_size, occupancy); + size_t occupancy = config.ComputeKernelOccupancy(); + return std::make_tuple(block_size, grid_size, occupancy); } size_t PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::ComputeKernelOccupancy() const @@ -624,8 +639,8 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit( size_t current_grid_size; size_t occupancy; - std::tie(std::ignore, std::ignore, current_grid_size, occupancy) = - GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config_list[selected_index]); + std::tie(std::ignore, current_grid_size, occupancy) = + GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config_list[selected_index]); bool need_k_split = current_grid_size <= non_split_gridsize; size_t gks = ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu); need_k_split |= gks != 0; @@ -658,8 +673,8 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit( { size_t current_grid_size; size_t occupancy; - std::tie(std::ignore, std::ignore, current_grid_size, occupancy) = - GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config); + std::tie(std::ignore, current_grid_size, occupancy) = + GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config); bool need_k_split = current_grid_size <= non_split_gridsize; size_t gks = ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu); @@ -787,6 +802,13 @@ bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid( return false; } + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, *this); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + return true; } @@ -861,6 +883,19 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable( miopen::GetTypeSize(problem.in_data_type))) return false; + { + auto largest_config = problem.IsFp32() + ? GetWrwXdlopsNHWCConfigLargestTileFp32() + : (problem.IsFp16() ? GetWrwXdlopsNHWCConfigLargestTileFp16() + : GetWrwXdlopsNHWCConfigLargestTileBf16()); + size_t current_block_size, current_grid_size, current_splits_4G; + std::tie(current_block_size, current_grid_size, current_splits_4G) = + GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, largest_config); + + if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL) + return false; + } + return true; } @@ -975,12 +1010,13 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution( ConvSolution result; KernelInfo kernel; - std::string kernel_name; size_t block_size; size_t grid_size; - std::tie(kernel_name, block_size, grid_size, std::ignore) = - GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config); + std::tie(block_size, grid_size, std::ignore) = + GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config); + + std::string kernel_name = config.ToKernelName(ctx); const auto& hi = problem.out_height; const auto& wi = problem.out_width; From 5d3a510375c44e2e7b284285c6e66bdd89d780cf Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sat, 29 Jul 2023 01:13:56 +0800 Subject: [PATCH 2/5] Fix legacy fwd/bwd hip igemm solvers which do not support group convo (#2281) --- src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 2 ++ src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp index 95baf53f9e..0d62ef8ce3 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp @@ -372,6 +372,8 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsIndexRangeLargeEnough(problem)) return false; + if(problem.GetGroupCount() > 1) + return false; switch(problem.conv_problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(problem); diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp index af7d6d896d..5f7068d73c 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -348,6 +348,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.IsLayoutNHWC()) return false; + if(problem.GetGroupCount() > 1) + return false; switch(problem.conv_problem.GetInDataType()) { case miopenInt8: return CheckCKApplicability(problem); From b07a980941899c2b922d5c56dd90f961d22d0cba Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 2 Aug 2023 08:12:45 -0700 Subject: [PATCH 3/5] [MI100] Remove Mlir entries from kdb (#2293) --- src/kernels/gfx90878.kdb.bz2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/gfx90878.kdb.bz2 b/src/kernels/gfx90878.kdb.bz2 index c56f381e0d..8f2a92297b 100644 --- a/src/kernels/gfx90878.kdb.bz2 +++ b/src/kernels/gfx90878.kdb.bz2 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89b40209ddf23566452005638fdc1d359690096abc606df3b069a97a7520452d -size 525411381 +oid sha256:29bcecf319bf59fc2e6036849192ce786d75a747181b32fc6b8c55ce67f1acce +size 336866906 From 696b4174baa8298bb9009a984f318c293a83f9fd Mon Sep 17 00:00:00 2001 From: zjing14 Date: Tue, 8 Aug 2023 14:21:08 -0500 Subject: [PATCH 4/5] [HOTFIX] Workaround for HIP iGEMM in buffer_load_max_length (#2297) * add workaround for SWDEV_413051 * fix bugprone-branch-clone --------- Co-authored-by: Jing Zhang Co-authored-by: Jun Liu --- src/include/miopen/solver/implicitgemm_util.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/miopen/solver/implicitgemm_util.hpp b/src/include/miopen/solver/implicitgemm_util.hpp index ce69aea846..4d8efcf2af 100644 --- a/src/include/miopen/solver/implicitgemm_util.hpp +++ b/src/include/miopen/solver/implicitgemm_util.hpp @@ -52,6 +52,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_BLOCK_SYNC_LDS_WITHOUT_SY // corresponding llvm intrinsic functions // so we disable using those llvm intrinsic functions on gfx1030 #define WORKAROUND_MIOPEN_ISSUE_557 1 +#define WORKAROUND_SWDEV_413051 1 namespace miopen { @@ -497,7 +498,7 @@ static inline bool support_amd_buffer_atomic_fadd(const std::string& device_name template int amd_buffer_load_max_length() { - if(std::is_same()) + if(std::is_same() || WORKAROUND_SWDEV_413051) { return 4; } From 2d5df5608f538b3dbfa653b04e4645001f8130e9 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 8 Aug 2023 12:53:19 -0700 Subject: [PATCH 5/5] [HotFix] Disable HIP iGEMM V6R1 DLOps Kernels (#2306) * Disable HIP iGEMM V6R1 DLOps Kernels * Update src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp Co-authored-by: Artem Tamazov * Allow test to use HIP iGEMM v6r1 kernels * add comment to explain the use of env var --------- Co-authored-by: Artem Tamazov --- src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 6 ++++++ test/CMakeLists.txt | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp index be8fb65224..d767b99334 100644 --- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp +++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp @@ -33,6 +33,8 @@ #include "../composable_kernel/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp" +#define WORKAROUND_SWDEV_411729 1 + MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW) namespace miopen { @@ -82,7 +84,11 @@ bool PerformanceConvCkIgemmFwdV6r1DlopsNchw::IsValid(const ProblemDescription& p bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem) const { +#if WORKAROUND_SWDEV_411729 + if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW{})) +#else if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW{})) +#endif return false; if(!ctx.use_hip_kernels) return false; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c33a8100b7..424d570ab1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1796,9 +1796,11 @@ add_custom_test(test_regression_float_mi100 SKIP_UNLESS_ALL GFX900_DISABLED GFX9 COMMAND ${IMPLICITGEMM_TESTING_ENV} MIOPEN_LOG_LEVEL=5 $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 256 38 38 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights ) +# MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW is explicitly enabled due to the kernel is disabled by default via #2306 set(CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW_ENV MIOPEN_FIND_MODE=normal - MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvCkIgemmFwdV6r1DlopsNchw) + MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvCkIgemmFwdV6r1DlopsNchw + MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW=1) # gfx908 disabled as a workaround for https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1790/files?diff=split&w=1#r982923610 add_custom_test(test_conv_ck_igemm_fwd_v6r1_dlops_nchw FLOAT_ENABLED HALF_ENABLED BF16_DISABLED GFX908_DISABLED GFX103X_ENABLED SKIP_UNLESS_ALL @@ -2104,10 +2106,11 @@ add_custom_test(smoke_solver_ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC GFX900_D ) # MIOPEN_DEBUG_TUNING_ITERATIONS_MAX is set to 2 because kernels are very slow to build. +# MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW is explicitly enabled due to the kernel is disabled by default via #2306 add_custom_test(smoke_solver_ConvCkIgemmFwdV6r1DlopsNchw GFX103X_ENABLED HALF_ENABLED TEST_TUNING COMMAND MIOPEN_FIND_ENFORCE=SEARCH_DB_UPDATE MIOPEN_DEBUG_TUNING_ITERATIONS_MAX=2 MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL=0 - MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvCkIgemmFwdV6r1DlopsNchw $ + MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvCkIgemmFwdV6r1DlopsNchw MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW=1 $ ${TEST_CONV_VERBOSE_F} --input 128 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 ${MIOPEN_TEST_FLAGS_ARGS} )