From 3d833ff06c3542eef87d699fc1552148cc6d4190 Mon Sep 17 00:00:00 2001 From: Kealan Barbieri Date: Fri, 20 Dec 2024 14:59:28 -0800 Subject: [PATCH 01/40] xe: jit: gemm: fix debug strategy submission --- src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp | 100 +++++++++++---------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index bec0b9ad301..10375a5a930 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -83,9 +83,57 @@ status_t gen_gemm_kernel_desc_t::finalize(const char *tags) { // Parse strategy string. strategy_ = GEMMStrategy(hw_, stepping_); - strategy_.unroll[LoopM] = entry_->driverInfo.unroll[LoopM]; - strategy_.unroll[LoopN] = entry_->driverInfo.unroll[LoopN]; - parseStrategy(entry_->strategy, hw_, problem_, strategy_); +#ifdef DNNL_DEV_MODE + std::string ovr_strategy; + ovr_strategy = gpu_utils::dev_getenv("GEMM_KERNEL", ovr_strategy); + if (!ovr_strategy.empty()) { + // Warning: will override problem data types (including up/down + // conversions) - this will cause inaccuracies if precisions/layouts + // are chosen that are incompatible with the given problem + std::stringstream ss(ovr_strategy); + std::string val; + ss >> val; + gpu_assert(val == "gemm"); + ss >> val; + const char *pstr = val.c_str(); + pstr = parsePrecisions(pstr, problem_.Ta_ext, problem_.Ta); + pstr = parsePrecisions(pstr, problem_.Tb_ext, problem_.Tb); + pstr = parsePrecisions(pstr, problem_.Tc, problem_.Tc_ext); + ss >> val; + pstr = val.c_str(); + pstr = parseLayout(pstr, problem_.A); + pstr = parseLayout(pstr, problem_.B); + pstr = parseLayout(pstr, problem_.C); + + if (problem_.A.alignment == 0) + problem_.A.setAlignment( + problem_.A.defaultAlignment(problem_.Ta_ext)); + if (problem_.B.alignment == 0) + problem_.B.setAlignment( + problem_.B.defaultAlignment(problem_.Tb_ext)); + if (problem_.C.alignment == 0) + problem_.C.setAlignment( + problem_.C.defaultAlignment(problem_.Tc_ext)); + + strategy_ = GEMMStrategy(hw_, stepping_); + ss >> strategy_.unroll[LoopM]; + ss >> strategy_.unroll[LoopN]; + + ss >> val; + problem_.alpha = std::stoi(val); + ss >> val; + problem_.beta = std::stoi(val); + + ovr_strategy = ss.str().substr(ss.tellg()); // remaining string + parseStrategy(ovr_strategy.c_str(), hw_, problem_, strategy_); + } else { +#endif + strategy_.unroll[LoopM] = entry_->driverInfo.unroll[LoopM]; + strategy_.unroll[LoopN] = entry_->driverInfo.unroll[LoopN]; + parseStrategy(entry_->strategy, hw_, problem_, strategy_); +#ifdef DNNL_DEV_MODE + } +#endif strategy_.panelCheck |= (isPacked(problem_.A.layout) || isPacked(problem_.B.layout)); adjustStrategy(hw_, problem_, strategy_, tags); @@ -182,52 +230,6 @@ status_t gen_gemm_kernel_desc_t::finalize(const char *tags) { } } -#ifdef DNNL_DEV_MODE - std::string ovr_strategy; - ovr_strategy = gpu_utils::dev_getenv("GEMM_KERNEL", ovr_strategy); - if (!ovr_strategy.empty()) { - // Warning: will override problem data types (including up/down - // conversions) - this will cause inaccuracies if precisions/layouts - // are chosen that are incompatible with the given problem - std::stringstream ss(ovr_strategy); - std::string val; - ss >> val; - gpu_assert(val == "gemm"); - ss >> val; - const char *pstr = val.c_str(); - pstr = parsePrecisions(pstr, problem_.Ta_ext, problem_.Ta); - pstr = parsePrecisions(pstr, problem_.Tb_ext, problem_.Tb); - pstr = parsePrecisions(pstr, problem_.Tc, problem_.Tc_ext); - ss >> val; - 
pstr = val.c_str(); - pstr = parseLayout(pstr, problem_.A); - pstr = parseLayout(pstr, problem_.B); - pstr = parseLayout(pstr, problem_.C); - - if (problem_.A.alignment == 0) - problem_.A.setAlignment( - problem_.A.defaultAlignment(problem_.Ta_ext)); - if (problem_.B.alignment == 0) - problem_.B.setAlignment( - problem_.B.defaultAlignment(problem_.Tb_ext)); - if (problem_.C.alignment == 0) - problem_.C.setAlignment( - problem_.C.defaultAlignment(problem_.Tc_ext)); - - strategy_ = GEMMStrategy(hw_, stepping_); - ss >> strategy_.unroll[LoopM]; - ss >> strategy_.unroll[LoopN]; - - ss >> val; - problem_.alpha = std::stoi(val); - ss >> val; - problem_.beta = std::stoi(val); - - ovr_strategy = ss.str().substr(ss.tellg()); // remaining string - parseStrategy(ovr_strategy.c_str(), hw_, problem_, strategy_); - } -#endif - strategy_.relaxedAccumulation |= relaxed_acc_; strategy_.systolicAvailable &= !disable_systolic_; try { From d27b7b7a0829e8ec2dda3ee8362353051ad09ecb Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Thu, 12 Dec 2024 15:53:16 -0800 Subject: [PATCH 02/40] src: common: make rnn_s8s8_compensation a power of 2 --- src/common/memory_desc.hpp | 14 ++++++------- src/common/memory_desc_wrapper.hpp | 28 +++++++++----------------- src/common/primitive_hashing.cpp | 8 +++----- src/common/serialization.cpp | 10 +++------ src/common/type_helpers.hpp | 19 +++-------------- src/cpu/rnn/rnn_reorders.hpp | 11 ++-------- src/gpu/intel/ocl/rnn/rnn_reorders.hpp | 12 +++-------- 7 files changed, 31 insertions(+), 71 deletions(-) diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 468a8528ec2..5dc820c67c1 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ const rnn_packed_memory_format_t ldio_p = rnn_packed_memory_format_t::ldio_p; // TODO: convert to 'enum class'. // Flags for memory special features enum memory_extra_flags_t { - dnnl_memory_extra_flag_none = 0x0U, + dnnl_memory_extra_flag_none = 0u, // Indicates the weights have an additional buffer, that depends on the // @p compensation_mask. // @@ -64,13 +64,13 @@ enum memory_extra_flags_t { // the additional buffer would consist of OC values: // O[oc : 0,OC] = // -128 * SUM(ic : 0,IC; kh : 0,KH; kw : 0,KW){ weights(oc, ic, kh, kw) } - dnnl_memory_extra_flag_compensation_conv_s8s8 = 0x1U, - dnnl_memory_extra_flag_scale_adjust = 0x2U, - dnnl_memory_extra_flag_rnn_u8s8_compensation = 0x4U, + dnnl_memory_extra_flag_compensation_conv_s8s8 = 1u, + dnnl_memory_extra_flag_scale_adjust = 2u, + dnnl_memory_extra_flag_rnn_u8s8_compensation = 4u, dnnl_memory_extra_flag_gpu_rnn_u8s8_compensation = dnnl_memory_extra_flag_rnn_u8s8_compensation, - dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 0x8U, - dnnl_memory_extra_flag_rnn_s8s8_compensation = 0x16U, + dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, + dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, }; // Create aliases for extra flags to preserve the old behavior. 
diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 5cf2e2f66ba..847951ba558 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -149,9 +149,7 @@ struct memory_desc_wrapper : public c_compatible { size_t additional_buffer_data_size(uint64_t flag_select) const { using namespace memory_extra_flags; if (flag_select & compensation_conv_s8s8) return sizeof(int32_t); - if ((flag_select & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set(flag_select)) - return sizeof(float); + if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); return 0; @@ -160,19 +158,16 @@ struct memory_desc_wrapper : public c_compatible { /** return true if memory format has additional buffer */ bool is_additional_buffer() const { using namespace memory_extra_flags; - // Currently compensation is not required for rnn_s8s8_compensation, - // but it has common bit with rnn_u8s8_compensation constant so we have - // to exclude rnn_s8s8_compensation case explicitly - return ((extra().flags - & (compensation_conv_s8s8 | rnn_u8s8_compensation - | compensation_conv_asymmetric_src)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)); + return extra().flags + & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_conv_asymmetric_src); } /** returns the size required for a particular extra memory buffer */ size_t additional_buffer_size(memory_extra_flags_t flag) const { using namespace memory_extra_flags; + const auto flags = extra().flags; + if (!(flags & flag)) return 0; const auto ndims = this->ndims(); const auto &pdims = padded_dims(); @@ -186,18 +181,15 @@ struct memory_desc_wrapper : public c_compatible { return (size_t)prod * buff_data_size; }; - if (extra().flags & compensation_conv_s8s8) { + if (flag == compensation_conv_s8s8) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - - if ((extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)) { + if (flag == rnn_u8s8_compensation) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - if (extra().flags & compensation_conv_asymmetric_src) { + if (flag == compensation_conv_asymmetric_src) { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 7c51d4d5de3..a8d9f25ce8c 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -190,11 +190,9 @@ size_t get_md_hash(const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { seed = hash_combine(seed, md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { seed = hash_combine(seed, md.extra.compensation_mask); } diff --git 
a/src/common/serialization.cpp b/src/common/serialization.cpp index fe43c2e2efc..8e40dd29819 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -120,18 +120,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { sstream.write(&md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { sstream.write(&md.extra.compensation_mask); } - if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { sstream.write(&md.extra.scale_adjust); } - if (md.extra.flags & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index ef617b4731d..7a6efb9d986 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -299,28 +299,15 @@ inline format_kind_t format_tag_to_kind(format_tag_t tag) { return format_kind::undef; } -// Currently rnn_s8s8_compensation has common bits with rnn_u8s8_compensation -// and scale_adjust constants so we have to perform additional checks to -// separate these two cases -inline bool extra_flag_rnn_s8s8_compensation_is_set(uint64_t flags) { - return ((flags & memory_extra_flags::rnn_s8s8_compensation) - ^ memory_extra_flags::rnn_s8s8_compensation) - == 0; -} - inline bool memory_extra_desc_is_equal( const memory_extra_desc_t &lhs, const memory_extra_desc_t &rhs) { using namespace memory_extra_flags; - return true && lhs.flags == rhs.flags + return lhs.flags == rhs.flags && IMPLICATION(lhs.flags & compensation_conv_s8s8, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & rnn_u8s8_compensation) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & rnn_u8s8_compensation, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & scale_adjust) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); diff --git a/src/cpu/rnn/rnn_reorders.hpp b/src/cpu/rnn/rnn_reorders.hpp index 5156350d860..e96828d369c 100644 --- a/src/cpu/rnn/rnn_reorders.hpp +++ b/src/cpu/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -779,12 +779,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { return unimplemented; // Check the proper memory desc has been passed to u8s8 and s8s8 - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases const bool check_u8s8 = (od.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - od.extra().flags) && od.extra().compensation_mask == ((id.ndims() == 5) ? 
27 /* 11011 */ : 13 /* 1101 */); @@ -886,9 +881,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { .template get(memory_tracking::names:: key_reorder_rnn_weights_reduction); float *comp = reinterpret_cast(dst + compensation_offset); - const bool req_s8s8_comp = (dst_d.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - dst_d.extra().flags); + const bool req_s8s8_comp = dst_d.extra().flags & rnn_u8s8_compensation; const auto mask_ok = [&](int mask) { return mask == ((src_d.ndims() == 5) ? 27 /* 11011 */ diff --git a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp index 80f1ed4c0b3..5b72142ce0a 100644 --- a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp +++ b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,14 +42,8 @@ struct rnn_weights_reorder_t : public gpu_primitive_t { status_t init(impl::engine_t *engine, impl::engine_t *src_engine, impl::engine_t *dst_engine) { - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases - VDISPATCH_REORDER( - !IMPLICATION(dst_md()->extra.flags - & memory_extra_flags::rnn_u8s8_compensation, - types::extra_flag_rnn_s8s8_compensation_is_set( - dst_md()->extra.flags)), + VDISPATCH_REORDER(dst_md()->extra.flags + & memory_extra_flags::rnn_u8s8_compensation, VERBOSE_BAD_FLAGS); VDISPATCH_REORDER(utils::one_of(src_engine->kind(), From 4d5ec0f83246adc434777f544552cc9e2c4f10d5 Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Wed, 8 Jan 2025 23:30:10 -0800 Subject: [PATCH 03/40] src: gpu: intel: jit: conv: add reorder-based precomputed zero points --- src/common/memory_desc.cpp | 7 +- src/common/memory_desc.hpp | 41 ++++- src/common/memory_desc_wrapper.hpp | 8 + src/common/primitive_hashing.cpp | 9 + src/common/serialization.cpp | 8 + src/common/type_helpers.hpp | 8 +- src/common/verbose.cpp | 15 ++ src/cpu/reorder/cpu_reorder_pd.hpp | 5 +- src/gpu/generic/convolution_deconvolution.hpp | 9 +- src/gpu/generic/cross_engine_reorder.cpp | 52 ++++-- src/gpu/generic/cross_engine_reorder.hpp | 9 +- src/gpu/gpu_reorder_pd.cpp | 101 +++++++++++ src/gpu/gpu_reorder_pd.hpp | 32 +++- src/gpu/gpu_utils.hpp | 2 +- src/gpu/gpu_zero_points_conv.cpp | 96 +++++++++++ src/gpu/gpu_zero_points_conv.hpp | 36 ++++ src/gpu/intel/jit/codegen/kernel.hpp | 57 ++++-- src/gpu/intel/jit/codegen/reorder.hpp | 12 +- src/gpu/intel/jit/conv/config.cpp | 77 +++++++++ src/gpu/intel/jit/conv/config.hpp | 4 +- src/gpu/intel/jit/conv/gen_convolution.cpp | 163 ++++++++---------- src/gpu/intel/jit/conv/normalization.cpp | 27 +-- src/gpu/intel/jit/conv/normalization.hpp | 5 +- src/gpu/intel/jit/conv/zp_plan.cpp | 12 +- src/gpu/intel/jit/conv/zp_plan.hpp | 3 +- src/gpu/intel/jit/ir/epilogue.cpp | 3 + src/gpu/intel/jit/ir/kernel_info.hpp | 6 +- src/gpu/intel/jit/ir/post_ops.cpp | 25 ++- src/gpu/intel/jit/ir/post_ops.hpp | 29 +++- src/gpu/intel/jit/ir/tensor_config.cpp | 10 +- src/gpu/intel/jit/reorder/gen_reorder.cpp | 12 +- src/gpu/intel/jit/reorder/gen_reorder.hpp | 3 +- src/gpu/intel/ocl/ref_reorder.cpp | 26 +-- src/gpu/intel/ocl/ref_reorder.hpp | 10 +- 34 files changed, 703 insertions(+), 219 deletions(-) 
create mode 100644 src/gpu/gpu_reorder_pd.cpp create mode 100644 src/gpu/gpu_zero_points_conv.cpp create mode 100644 src/gpu/gpu_zero_points_conv.hpp diff --git a/src/common/memory_desc.cpp b/src/common/memory_desc.cpp index f9345a72302..5d5a0958b52 100644 --- a/src/common/memory_desc.cpp +++ b/src/common/memory_desc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -471,8 +471,9 @@ status_t memory_desc_permute_axes(memory_desc_t &out_memory_desc, VCHECK_MEMORY( !memory_desc_wrapper(in_memory_desc).has_runtime_dims_or_strides(), invalid_arguments, VERBOSE_UNSUPPORTED_MEM_STRIDE); - VCHECK_MEMORY(in_memory_desc.extra.flags == 0, invalid_arguments, - VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); + VCHECK_MEMORY( + check_md_extra_flags_compensation_gpu(in_memory_desc.extra.flags), + invalid_arguments, VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); // verify that perm is indeed a permutation of [0 .. ndims) unsigned occurrence_mask = 0; diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 5dc820c67c1..3b9dd8d0b1e 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -71,6 +71,15 @@ enum memory_extra_flags_t { = dnnl_memory_extra_flag_rnn_u8s8_compensation, dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, + // This flag has to be kept separate from *compensation_conv_asymmetric_src + // since the GPU precompute algorithm is incompatible with that of the CPU + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src = 32u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when precompute is to be performed for a backward-by-data convolution + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd = 64u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when IC and OC are swapped to reinterpret a deconv as a BWD_D conv + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap = 128u, }; // Create aliases for extra flags to preserve the old behavior. @@ -87,8 +96,23 @@ const memory_extra_flags_t rnn_s8s8_compensation = dnnl_memory_extra_flag_rnn_s8s8_compensation; const memory_extra_flags_t compensation_conv_asymmetric_src = dnnl_memory_extra_flag_compensation_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_bwd + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_swap + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap; } // namespace memory_extra_flags +inline bool check_md_extra_flags_compensation_gpu(uint64_t flags) { + using namespace memory_extra_flags; + const uint64_t c = compensation_gpu_conv_asymmetric_src; + const uint64_t b = compensation_gpu_conv_asymmetric_src_bwd; + const uint64_t s = compensation_gpu_conv_asymmetric_src_swap; + return (flags == none) || (flags == c) || (flags == (c | b)) + || (flags == (c | b | s)); +} + // Generic description of blocked data layout for most memory formats. struct blocking_desc_t { // The strides between the outermost blocks. 
@@ -208,7 +232,12 @@ struct memory_extra_desc_t { : flags(0) , compensation_mask(0) , scale_adjust(0.0f) - , asymm_compensation_mask(0) {} + , asymm_compensation_mask(0) + , idhw {0, 0, 0} + , odhw {0, 0, 0} + , pdhw {0, 0, 0} + , ddhw {0, 0, 0} + , dst_size(0) {} // The flags contain arbitrary extra information, such as compensation. // @sa dnnl_memory_extra_flags_t uint64_t flags; @@ -218,6 +247,16 @@ struct memory_extra_desc_t { float scale_adjust; // Compensation mask for asymmetric quantization int asymm_compensation_mask; + // Precomp GPU ZP convolution input spatials + dim_t idhw[3]; + // Precomp GPU ZP convolution output spatials + dim_t odhw[3]; + // Precomp GPU ZP convolution padding spatials + dim_t pdhw[3]; + // Precomp GPU ZP convolution dilation spatials + dim_t ddhw[3]; + // Precomp GPU ZP convolution destination size + dim_t dst_size; }; status_t DNNL_API memory_desc_init_by_tag(memory_desc_t &memory_desc, int ndims, diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 847951ba558..9b32468975b 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -152,6 +152,8 @@ struct memory_desc_wrapper : public c_compatible { if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); + if (flag_select & compensation_gpu_conv_asymmetric_src) + return sizeof(int32_t); return 0; } @@ -160,6 +162,7 @@ struct memory_desc_wrapper : public c_compatible { using namespace memory_extra_flags; return extra().flags & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_gpu_conv_asymmetric_src | compensation_conv_asymmetric_src); } @@ -193,6 +196,9 @@ struct memory_desc_wrapper : public c_compatible { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } + if (flag == compensation_gpu_conv_asymmetric_src) { + return extra().dst_size; + } return 0; } @@ -212,6 +218,8 @@ struct memory_desc_wrapper : public c_compatible { buff_size += additional_buffer_size(compensation_conv_s8s8); buff_size += additional_buffer_size(rnn_u8s8_compensation); buff_size += additional_buffer_size(compensation_conv_asymmetric_src); + buff_size + += additional_buffer_size(compensation_gpu_conv_asymmetric_src); return buff_size; } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index a8d9f25ce8c..a7a0f9ed295 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -204,6 +204,15 @@ size_t get_md_hash(const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { seed = hash_combine(seed, md.extra.asymm_compensation_mask); } + + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + seed = get_array_hash(seed, md.extra.idhw, 3); + seed = get_array_hash(seed, md.extra.odhw, 3); + seed = get_array_hash(seed, md.extra.pdhw, 3); + seed = get_array_hash(seed, md.extra.ddhw, 3); + seed = hash_combine(seed, md.extra.dst_size); + } } // Combined hash for a memory descriptor return seed; diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index 8e40dd29819..afe9c37f49e 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -132,6 +132,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); } + if (md.extra.flags + & 
dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + sstream.write(md.extra.idhw, 3); + sstream.write(md.extra.odhw, 3); + sstream.write(md.extra.pdhw, 3); + sstream.write(md.extra.ddhw, 3); + sstream.write(&md.extra.dst_size); + } } } diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index 7a6efb9d986..c8abbbe4364 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -310,7 +310,13 @@ inline bool memory_extra_desc_is_equal( && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, - lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); + lhs.asymm_compensation_mask == rhs.asymm_compensation_mask) + && IMPLICATION(lhs.flags & compensation_gpu_conv_asymmetric_src, + (lhs.dst_size == rhs.dst_size) + && utils::array_cmp(lhs.idhw, rhs.idhw, 3) + && utils::array_cmp(lhs.odhw, rhs.odhw, 3) + && utils::array_cmp(lhs.pdhw, rhs.pdhw, 3) + && utils::array_cmp(lhs.ddhw, rhs.ddhw, 3)); } inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md, diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 852e61deef3..c9344c638e1 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -392,6 +392,21 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { ss << ":s8m" << extra.compensation_mask; if (extra.flags & compensation_conv_asymmetric_src) ss << ":zpm" << extra.asymm_compensation_mask; + if (extra.flags & compensation_gpu_conv_asymmetric_src) { + ss << ":zid" << extra.idhw[0]; + ss << ":zih" << extra.idhw[1]; + ss << ":ziw" << extra.idhw[2]; + ss << ":zod" << extra.odhw[0]; + ss << ":zoh" << extra.odhw[1]; + ss << ":zow" << extra.odhw[2]; + ss << ":zpd" << extra.pdhw[0]; + ss << ":zph" << extra.pdhw[1]; + ss << ":zpw" << extra.pdhw[2]; + ss << ":zdd" << extra.ddhw[0]; + ss << ":zdh" << extra.ddhw[1]; + ss << ":zdw" << extra.ddhw[2]; + ss << ":zs" << extra.dst_size; + } if (extra.flags & scale_adjust && extra.scale_adjust != 1.f) ss << ":sa" << extra.scale_adjust; return ss; diff --git a/src/cpu/reorder/cpu_reorder_pd.hpp b/src/cpu/reorder/cpu_reorder_pd.hpp index d1c8499c151..ca69992b0fe 100644 --- a/src/cpu/reorder/cpu_reorder_pd.hpp +++ b/src/cpu/reorder/cpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,6 +38,9 @@ struct cpu_reorder_pd_t : public reorder_pd_t { post_ops.len() == 1 && post_ops.entry_[0].kind == primitive_kind::sum); VDISPATCH_REORDER(args_ok, VERBOSE_UNSUPPORTED_POSTOP); + auto gpu_zp = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + VDISPATCH_REORDER(!(dst_md()->extra.flags & gpu_zp), + VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); return status::success; } diff --git a/src/gpu/generic/convolution_deconvolution.hpp b/src/gpu/generic/convolution_deconvolution.hpp index 74893d4c5db..1c07d94522d 100644 --- a/src/gpu/generic/convolution_deconvolution.hpp +++ b/src/gpu/generic/convolution_deconvolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,12 +32,15 @@ namespace generic { static status_t weights_axes_permutation( memory_desc_t *o_md, const memory_desc_t *i_md, bool with_groups) { + using namespace memory_extra_flags; int perm[DNNL_MAX_NDIMS] {}; // deconv to conv weight permutation for (int d = 0; d < DNNL_MAX_NDIMS; ++d) perm[d] = d; nstl::swap(perm[0 + with_groups], perm[1 + with_groups]); - - return memory_desc_permute_axes(*o_md, *i_md, perm); + CHECK(memory_desc_permute_axes(*o_md, *i_md, perm)); + if (o_md->extra.flags & compensation_gpu_conv_asymmetric_src) + o_md->extra.flags |= compensation_gpu_conv_asymmetric_src_swap; + return status::success; } static status_t conv_descr_create( diff --git a/src/gpu/generic/cross_engine_reorder.cpp b/src/gpu/generic/cross_engine_reorder.cpp index 6ded618a9c9..cbf4672c4c6 100644 --- a/src/gpu/generic/cross_engine_reorder.cpp +++ b/src/gpu/generic/cross_engine_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,18 @@ namespace impl { namespace gpu { namespace generic { -void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *engine) { - using namespace memory_tracking::names; - if (!do_reorder_) return; - - auto *gpu_engine = utils::downcast(engine); - - const memory_desc_wrapper wspace_md( - desc()->src_engine_kind == reorder_engine_kind_ ? dst_md() - : src_md()); - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_reorder_cross_space, - wspace_md.size(), 1, gpu_engine->get_buffer_alignment()); - scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), 1, - gpu_engine->get_buffer_alignment()); +void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *gpu_engine) { + if (do_reorder_) { + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(gpu_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto needs_dst = desc()->src_engine_kind == reorder_engine_kind_; + memory_desc_wrapper wspace((needs_dst) ? 
dst_md() : src_md()); + scratchpad.book(key_reorder_cross_space, wspace.size(), 1, gpu_align); + scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), + 1, gpu_align); + } } status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, @@ -50,7 +48,7 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, dst_engine->kind()), VERBOSE_BAD_ENGINE_KIND); VDISPATCH_REORDER(attr_ok(), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); memory_desc_wrapper src_mdw(src_md()); memory_desc_wrapper dst_mdw(dst_md()); @@ -72,17 +70,31 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, primitive_attr_t r_attr(*attr()); if (!r_attr.is_initialized()) return status::out_of_memory; - VDISPATCH_REORDER_SC(reorder_primitive_desc_create(reorder_pd_, - reorder_engine, src_md(), dst_md(), &r_attr), + auto clean_src_md = *src_md(); + auto clean_dst_md = *dst_md(); + clean_src_md.extra = clean_dst_md.extra = {}; + VDISPATCH_REORDER_SC( + reorder_primitive_desc_create(reorder_pd_, reorder_engine, + &clean_src_md, &clean_dst_md, &r_attr), VERBOSE_PRIMITIVE_CREATION_FAIL, "reorder"); - init_scratchpad(engine); reorder_pd_t::init_desc( src_engine->kind(), dst_engine->kind(), true /* is_cross_engine */); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad( + (dst_engine->kind() == engine_kind::gpu) ? dst_engine : src_engine); return status::success; } +status_t cross_engine_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + if (!pd()->do_reorder_) return status::success; + return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); +} + status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { using namespace memory_tracking::names; auto *gpu_stream = utils::downcast(ctx.stream()); @@ -158,6 +170,8 @@ status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC), ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)); } + if (status == status::success) + status = pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_); } return status; } diff --git a/src/gpu/generic/cross_engine_reorder.hpp b/src/gpu/generic/cross_engine_reorder.hpp index cd69fefefaf..c6557ddaaeb 100644 --- a/src/gpu/generic/cross_engine_reorder.hpp +++ b/src/gpu/generic/cross_engine_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,16 +57,13 @@ struct cross_engine_reorder_t : public gpu::primitive_t { DECLARE_GPU_REORDER_CREATE(); }; - status_t init(impl::engine_t *engine) override { - if (!pd()->do_reorder_) return status::success; - return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); - } - + status_t init(impl::engine_t *engine) override; status_t execute(const exec_ctx_t &ctx) const override; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::shared_ptr reorder_; + std::shared_ptr zp_precomp_conv_; }; } // namespace generic diff --git a/src/gpu/gpu_reorder_pd.cpp b/src/gpu/gpu_reorder_pd.cpp new file mode 100644 index 00000000000..ca293db5c89 --- /dev/null +++ b/src/gpu/gpu_reorder_pd.cpp @@ -0,0 +1,101 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/gpu_reorder_pd.hpp" +#include "gpu/gpu_engine.hpp" +#include "gpu/gpu_stream.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv_pd( + impl::engine_t *dst_engine) { + memory_desc_wrapper dst_mdw(dst_md()); + auto &extra = dst_mdw.extra(); + auto needs_conv = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + auto is_dst_gpu = (dst_engine->kind() == engine_kind::gpu); + do_zp_precomp_conv_ = is_dst_gpu && (extra.flags & needs_conv); + if (!do_zp_precomp_conv_) return status::success; + + using namespace memory_extra_flags; + const auto out_type = data_type::f32; + primitive_attr_t attr; + const bool is_bwd_d + = extra.flags & compensation_gpu_conv_asymmetric_src_bwd; + auto prop = (is_bwd_d) ? prop_kind::backward_data + : prop_kind::forward_inference; + CHECK(create_zp_precompute_conv_pd(zp_precomp_conv_pd_, dst_engine, attr, + dst_md(), extra.idhw, extra.odhw, extra.pdhw, extra.ddhw, out_type, + prop)); + + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(dst_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto registry = zp_precomp_conv_pd_->scratchpad_registry(); + memory_desc_wrapper wspace((is_bwd_d) ? 
zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md()); + scratchpad.book(key_conv_tr_src, wspace.size(), 1, gpu_align); + scratchpad.book(key_conv_tails, registry.size(), 1, gpu_align); + return status::success; +} + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const { + if (!do_zp_precomp_conv_) return status::success; + return primitive->create_nested_primitive( + zp_precomp_conv, zp_precomp_conv_pd_, engine); +} + +status_t gpu_reorder_pd_t::maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const { + using namespace memory_tracking::names; + if (!do_zp_precomp_conv_) return status::success; + + const bool is_bwd_d = (zp_precomp_conv_pd_->get_prop_kind() + == prop_kind::backward_data); + auto *gpu_stream = utils::downcast(ctx.stream()); + auto conv_md_in = (is_bwd_d) ? zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md(); + auto scratchpad + = ctx.get_scratchpad_grantor().get_memory_storage(key_conv_tr_src); + std::unique_ptr wspace; + CHECK(safe_ptr_assign(wspace, + new memory_t(ctx.stream()->engine(), conv_md_in, + std::move(scratchpad)))); + CHECK(gpu_stream->fill(*wspace->memory_storage(), 0x01, + memory_desc_wrapper(conv_md_in).size(), + gpu_stream->ctx().get_deps(), gpu_stream->ctx().get_deps())); + + exec_args_t r_args; + auto arg_in = (is_bwd_d) ? DNNL_ARG_DIFF_DST : DNNL_ARG_SRC; + auto arg_out = (is_bwd_d) ? DNNL_ARG_DIFF_SRC : DNNL_ARG_DST; + r_args[arg_in] = memory_arg_t {(memory_t *)wspace.get(), true}; + r_args[DNNL_ARG_WEIGHTS] = memory_arg_t {ctx.output(DNNL_ARG_TO), true}; + r_args[arg_out] = memory_arg_t {ctx.output(DNNL_ARG_TO), false}; + exec_ctx_t r_ctx(ctx, std::move(r_args)); + + nested_scratchpad_t ns(ctx, key_conv_tails, zp_precomp_conv); + r_ctx.set_scratchpad_grantor(ns.grantor()); + return zp_precomp_conv->execute(r_ctx); +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_reorder_pd.hpp b/src/gpu/gpu_reorder_pd.hpp index d70c28bdd81..71617d96dc8 100644 --- a/src/gpu/gpu_reorder_pd.hpp +++ b/src/gpu/gpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #define GPU_GPU_REORDER_PD_HPP #include "common/reorder_pd.hpp" +#include "gpu/gpu_primitive.hpp" namespace dnnl { namespace impl { @@ -28,10 +29,9 @@ struct gpu_reorder_pd_t : public reorder_pd_t { protected: bool attr_ok() const { - return attr()->has_default_values( - dnnl_primitive_attr::skip_mask_t::zero_points_runtime - | dnnl_primitive_attr::skip_mask_t::scales_runtime - | dnnl_primitive_attr::skip_mask_t::post_ops) + using sm = dnnl_primitive_attr::skip_mask_t; + return attr()->has_default_values(sm::zero_points_runtime + | sm::scales_runtime | sm::post_ops) && post_ops_ok() && zero_points_ok(); } @@ -62,9 +62,27 @@ struct gpu_reorder_pd_t : public reorder_pd_t { && post_ops.entry_[0].kind == primitive_kind::sum); } - bool extra_ok() const { - return src_md()->extra.flags == 0 && dst_md()->extra.flags == 0; + bool extra_ok(bool accept_conv_asymm = false) const { + if (!accept_conv_asymm) + return (src_md()->extra.flags == memory_extra_flags::none) + && (dst_md()->extra.flags == memory_extra_flags::none); + return check_md_extra_flags_compensation_gpu(src_md()->extra.flags) + && check_md_extra_flags_compensation_gpu(dst_md()->extra.flags); } + + status_t maybe_create_zp_precompute_conv_pd(impl::engine_t *dst_engine); + +public: + status_t maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const; + + status_t maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const; + +private: + bool do_zp_precomp_conv_ = false; + std::shared_ptr zp_precomp_conv_pd_; }; } // namespace gpu diff --git a/src/gpu/gpu_utils.hpp b/src/gpu/gpu_utils.hpp index 18c82b1dccc..fe56ccaba41 100644 --- a/src/gpu/gpu_utils.hpp +++ b/src/gpu/gpu_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/gpu/gpu_zero_points_conv.cpp b/src/gpu/gpu_zero_points_conv.cpp new file mode 100644 index 00000000000..0e1edb567d9 --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include + +#include "common/convolution_pd.hpp" +#include "common/primitive_desc_iterator.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0) { + using namespace memory_extra_flags; + auto real_wei = *wei; + const int off = (!idhw[1]) ? 2 + !idhw[2] : !idhw[0]; + const bool with_groups = (real_wei.ndims == (6 - off)); + if (real_wei.extra.flags & compensation_gpu_conv_asymmetric_src_swap) { + static_assert(DNNL_MAX_NDIMS == 12, "DNNL_MAX_NDIMS is not 12"); + std::array perm_grp + = {0, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + std::array perm_no_grp + = {1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + CHECK(memory_desc_permute_axes(real_wei, *wei, + (with_groups) ? perm_grp.data() : perm_no_grp.data())); + } + real_wei.extra = memory_extra_desc_t(); + + const auto &dims = real_wei.dims; + const bool is_fwd = ((prop == prop_kind::forward_training) + || (prop == prop_kind::forward_inference)); + const bool is_bwd_d = (prop == prop_kind::backward_data); + assert((off < 3) && (real_wei.ndims >= 5 - off) && (is_fwd || is_bwd_d)); + MAYBE_UNUSED(is_fwd); + + using memory_dims = std::vector; + memory_dims S1 {1, 1, 1}; + memory_dims P1 {0, 0, 0}; + // dim order for weights: [G,] OC, IC, [[[D,] H,] W] + memory_dims dims_in {1, + (with_groups) ? dims[0] * dims[2 - is_bwd_d] : dims[1 - is_bwd_d]}; + memory_dims dims_out {1, + (with_groups) ? dims[0] * dims[1 + is_bwd_d] : dims[0 + is_bwd_d]}; + for (int i = off; i < 3; i++) { + const auto k_idx = 2 + with_groups + i - off; + const auto KD = (dims[k_idx] - 1) * (ddhw[i] + 1) + 1; + dims_in.emplace_back(idhw[i]); + dims_out.emplace_back(odhw[i]); + P1[i] = dims_out.back() - dims_in.back() - 1 + KD - pdhw[i]; + } + + memory_desc_t in, out; + CHECK(memory_desc_init_by_tag(out, int(dims_out.size()), dims_out.data(), + out_type, format_tag::any)); + CHECK(memory_desc_init_by_tag(in, int(dims_in.size()), dims_in.data(), + data_type::s8, format_tag::any)); + + if (has_offset0) { + auto out_type_size = types::data_type_size(out_type); + auto offset0 = memory_desc_wrapper(real_wei).size(0, false); + assert(offset0 % out_type_size == 0); + out.offset0 = offset0 / out_type_size; + } + auto conv_desc = convolution_desc_t(); + CHECK(dnnl::impl::conv_desc_init(&conv_desc, prop, + alg_kind::convolution_direct, (is_bwd_d) ? &out : &in, &real_wei, + nullptr, (is_bwd_d) ? &in : &out, S1.data() + off, ddhw + off, + pdhw + off, P1.data() + off)); + primitive_desc_iterator_t it(eng, (op_desc_t *)&conv_desc, &attr, nullptr); + if (!it.is_initialized()) return status::out_of_memory; + retn = *(++it); + return (retn) ? 
status::success : status::unimplemented; +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_zero_points_conv.hpp b/src/gpu/gpu_zero_points_conv.hpp new file mode 100644 index 00000000000..e287454b4ec --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_GPU_ZERO_POINTS_CONV_HPP +#define GPU_GPU_ZERO_POINTS_CONV_HPP + +#include "common/primitive_desc.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0 = true); + +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index fcee4247d7e..c099e0bc886 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -28,6 +28,8 @@ #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/jit/codegen/operand.hpp" #include "gpu/intel/jit/codegen/register_allocator.hpp" +#include "gpu/intel/jit/codegen/register_scope.hpp" +#include "gpu/intel/jit/codegen/reorder.hpp" #include "gpu/intel/jit/emulation.hpp" #include "gpu/intel/jit/ir/ir.hpp" #include "gpu/intel/jit/ir/ir_builder.hpp" @@ -591,20 +593,28 @@ class ir_kernel_t : public jit_generator { } void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src1); if (hw >= ngen::HW::XeHP) { if (src2.is_reg_data()) { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src2); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } return; } add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); if (src2.is_reg_data()) { + align_src_dst_offset(this, scope, mod, dst, src2); add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data()); } else { add(mod, dst.reg_data(), dst.reg_data(), src2.immediate()); @@ -612,26 +622,34 
@@ class ir_kernel_t : public jit_generator { } void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src1); if (src2.is_reg_data()) { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src2); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else if (hw < ngen::HW::XeLP) { + align_src_dst_offset(this, scope, mod, dst, src0); mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate()); add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data()); } else if (src0.is_immediate() && (ngen_is_dw(src0.type()) || src0.type() == ngen::DataType::uw)) { // dword immediate src0 is not supported, move to a register. - auto tmp_src0 = ra_.alloc_sub(src0.type()); + auto tmp_src0 = scope.alloc_sub(src0.type()); mov(1, tmp_src0, src0.immediate()); - mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(), - src2.immediate()); - ra_.safeRelease(tmp_src0); + mad(mod, dst.reg_data(), tmp_src0, + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } else { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + align_src_dst_offset(this, scope, mod, dst, src0); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } } @@ -1160,6 +1178,13 @@ class ir_kernel_t : public jit_generator { return ir_utils::safe_divide(local_size, exec_cfg_.simd()); } + static ngen::RegData fixup_ternary_rgn(const ngen::RegData &r) { + ngen::RegData retn = r; + return ((retn.getHS() == 1) && (retn.getVS() == retn.getWidth())) + ? retn.setRegion(1, 1, 0) + : retn; + } + kernel_iface_t kernel_iface_; std::string kernel_name_; exec_config_t exec_cfg_; diff --git a/src/gpu/intel/jit/codegen/reorder.hpp b/src/gpu/intel/jit/codegen/reorder.hpp index 12d2187c8d1..aa4bc370794 100644 --- a/src/gpu/intel/jit/codegen/reorder.hpp +++ b/src/gpu/intel/jit/codegen/reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1300,15 +1300,17 @@ void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope, int dst_off = dst.offset(); int src_byte_off = src.byte_offset(); int dst_byte_off = dst.byte_offset(); + int esize = mod.getExecSize(); + int grf_size = ngen::GRF::bytes(scope.hw()); + int grf_src = grf_size / src.hs(); + int grf_dst = grf_size / dst.hs(); // If src is aligned with dst, return. - if ((is_xf || is_bf_to_f) && src_off == dst_off) return; - if (!is_xf && src_byte_off == dst_byte_off) return; + if ((is_xf || is_bf_to_f) && src_off % grf_src == dst_off % grf_dst) return; + if (!is_xf && src_byte_off % grf_size == dst_byte_off % grf_size) return; int new_src_byte_off = (is_xf ? 
dst_off * src_type_size : dst_byte_off); - int esize = mod.getExecSize(); - int grf_size = ngen::GRF::bytes(scope.hw()); int src_size = std::max(src_type_size * esize * src_stride, src_type_size); auto new_src = scope.alloc_reg_buf_data( diff --git a/src/gpu/intel/jit/conv/config.cpp b/src/gpu/intel/jit/conv/config.cpp index 08bbb4e4166..fe36059d3e0 100644 --- a/src/gpu/intel/jit/conv/config.cpp +++ b/src/gpu/intel/jit/conv/config.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/utils.hpp" #include "gpu/intel/jit/conv/grf_usage.hpp" #include "gpu/intel/jit/conv/message_patterns.hpp" #include "gpu/intel/jit/conv/normalization.hpp" @@ -659,6 +660,61 @@ void init_data_tags(const conv_config_t &cfg, const memory_desc_t &src_md, if (user_dst_req == "user") dst_tag = user_dst_tag = "user"; } +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw) { + const bool is_bwd_d = (prb.prop_kind() == prop_kind::backward_data); + using memory_dims = std::vector; + memory_dims I {prb.id, prb.ih, prb.iw}; + memory_dims O {prb.od, prb.oh, prb.ow}; + memory_dims K {prb.kd, prb.kh, prb.kw}; + memory_dims S {prb.sd, prb.sh, prb.sw}; + memory_dims D {prb.dd, prb.dh, prb.dw}; + memory_dims P {prb.pd, prb.ph, prb.pw}; + const int off = 5 - prb.ndims; + const auto *w = prb.conv_pd->weights_md(); + + // restore the original layout of the prb values + const auto *s + = (is_bwd_d) ? prb.conv_pd->diff_dst_md() : prb.conv_pd->src_md(); + const auto *d + = (is_bwd_d) ? prb.conv_pd->diff_src_md() : prb.conv_pd->dst_md(); + auto has_dim = [&](int i) { + return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) + || (w->dims[2 + i + prb.with_groups] > 1); + }; + auto move_back = [&](int i, int off) { + if (off == 0) return; + I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; + D[i - off] = P[i - off] = 0; + std::swap(I[i - off], I[i]); + std::swap(O[i - off], O[i]); + std::swap(K[i - off], K[i]); + std::swap(S[i - off], S[i]); + std::swap(D[i - off], D[i]); + std::swap(P[i - off], P[i]); + }; + bool has_d = (off <= 0) && has_dim(0 - off); + bool has_h = (off <= 1) && has_dim(1 - off); + bool has_w = (off <= 2) && has_dim(2 - off); + if (!has_d && !has_h && !has_w) has_w = true; + move_back(1, has_d * (!has_h == has_w)); + move_back(2, !has_w * (!has_h + 1)); + + for (int i = off; i < int(K.size()); i++) { + const auto KD = (K[i] - 1) * (D[i] + 1) + 1; + ir_assert(w->dims[2 + i + prb.with_groups - off] == K[i]); + O[i] = ir_utils::max_unique_pad_states( + O[i], I[i], KD, P[i], S[i], true); + I[i] = std::min(KD, I[i]); + } + for (int i = 0; i < 3; i++) { + idhw[i] = (i < off) ? 0 : I[i]; + odhw[i] = (i < off) ? 0 : O[i]; + pdhw[i] = (i < off) ? 0 : P[i]; + ddhw[i] = (i < off) ? 
0 : D[i]; + } +} + status_t init_tensor_layouts( conv_config_t &cfg, convolution_pd_t *pd, impl::engine_t *engine) { const auto &prb = cfg.prb(); @@ -778,6 +834,27 @@ status_t init_tensor_layouts( bia.set_compute(bia_layout); bia.set_user(user_bia_layout); + if (cfg.zp_cfg().needs_src_reorder_precalc) { + auto get_channels = [](const layout_t &layout) { + const dim_t min_esize = 16; + return std::max(utils::rnd_up_pow2(layout.dim(1) * layout.dim(2)), + min_esize); + }; + using namespace memory_extra_flags; + prepare_zp_precompute_conv(prb, wei_md.extra.idhw, wei_md.extra.odhw, + wei_md.extra.pdhw, wei_md.extra.ddhw); + + wei_md.extra.dst_size = sizeof(float); + for (const auto &o : wei_md.extra.odhw) + wei_md.extra.dst_size *= std::max(o, dim_t(1)); + if (prb.prop_kind() == prop_kind::backward_data) { + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src_bwd; + wei_md.extra.dst_size *= get_channels(src_layout); + } else { + wei_md.extra.dst_size *= get_channels(dst_layout); + } + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src; + } return status::success; } diff --git a/src/gpu/intel/jit/conv/config.hpp b/src/gpu/intel/jit/conv/config.hpp index f698d7ab546..b20e7240889 100644 --- a/src/gpu/intel/jit/conv/config.hpp +++ b/src/gpu/intel/jit/conv/config.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -675,6 +675,8 @@ int default_regs(const conv_config_t &cfg); void init_kernel_grid(conv_config_t &cfg); void init_walk_order(conv_config_t &cfg); void init_thread_group_grid(conv_config_t &cfg); +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw); std::array get_kernel_grid_conv_dims(const conv_config_t &cfg); std::array get_thread_group_grid_conv_dims( const conv_config_t &cfg); diff --git a/src/gpu/intel/jit/conv/gen_convolution.cpp b/src/gpu/intel/jit/conv/gen_convolution.cpp index 425d5977944..2a71b661da7 100644 --- a/src/gpu/intel/jit/conv/gen_convolution.cpp +++ b/src/gpu/intel/jit/conv/gen_convolution.cpp @@ -25,6 +25,7 @@ #include "common/impl_registration.hpp" #include "common/utils.hpp" #include "common/verbose.hpp" +#include "gpu/gpu_zero_points_conv.hpp" #include "gpu/intel/jit/ir/kernel_info.hpp" #include "gpu/intel/jit/reorder/reorder_kernel.hpp" #include "gpu/intel/jit/utils/utils.hpp" @@ -45,8 +46,7 @@ struct conv_pd_data_t { conv_config_t pd_cfg; tensor_config_t tensor_cfg; std::vector kernel_infos; - std::shared_ptr zp_pd; - std::shared_ptr zp_prim; + std::shared_ptr zp_pd; }; class gen_convolution_t { @@ -80,79 +80,31 @@ class gen_convolution_t { CHECK(init_pd_time_cfg( prb, pd->data->pd_cfg, engine, pd, &pd->attr_)); - if (pd->data->pd_cfg.zp_cfg().needs_src_precalc) { - memory::dims I {prb.id, prb.ih, prb.iw}; - memory::dims O {prb.od, prb.oh, prb.ow}; - memory::dims K {prb.kd, prb.kh, prb.kw}; - memory::dims S {prb.sd, prb.sh, prb.sw}; - memory::dims D {prb.dd, prb.dh, prb.dw}; - memory::dims P {prb.pd, prb.ph, prb.pw}; - const int off = 5 - prb.ndims; - const auto *w = pd->invariant_wei_md(); - { // restore the original layout of the prb values - const auto *s = pd->invariant_src_md(); - const auto *d = pd->invariant_dst_md(); - auto has_dim = [&](int i) { - return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) - || (w->dims[2 + 
i + prb.with_groups] > 1); - }; - auto move_back = [&](int i, int off) { - if (off == 0) return; - I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; - D[i - off] = P[i - off] = 0; - std::swap(I[i - off], I[i]); - std::swap(O[i - off], O[i]); - std::swap(K[i - off], K[i]); - std::swap(S[i - off], S[i]); - std::swap(D[i - off], D[i]); - std::swap(P[i - off], P[i]); - }; - bool has_d = (off <= 0) && has_dim(0 - off); - bool has_h = (off <= 1) && has_dim(1 - off); - bool has_w = (off <= 2) && has_dim(2 - off); - if (!has_d && !has_h && !has_w) has_w = true; - move_back(1, has_d * (!has_h == has_w)); - move_back(2, !has_w * (!has_h + 1)); + if (pd->data->pd_cfg.zp_cfg().needs_src_reorder_precalc + || pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + primitive_attr_t attr; + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + int mask = 0; + CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); + attr.zero_points_.set(DNNL_ARG_SRC, mask); + attr.post_ops_.append_eltwise( + 1.f, alg_kind::eltwise_linear, -1.f, 0.f); } - memory::dims S1 {1, 1, 1}; - memory::dims P1 {0, 0, 0}; - memory::dims dims_src {1, dim_t(prb.g) * prb.ic}; - memory::dims dims_dst {1, dim_t(prb.g) * prb.oc}; - - for (int i = off; i < int(K.size()); i++) { - const auto KD = (K[i] - 1) * (D[i] + 1) + 1; - dims_src.emplace_back(std::min(KD, I[i])); - dims_dst.emplace_back(ir_utils::max_unique_pad_states( - O[i], I[i], KD, P[i], S[i], true)); - P1[i] = dims_dst.back() - dims_src.back() - 1 + KD - P[i]; + dim_t I[3], O[3], P[3], D[3]; + prepare_zp_precompute_conv(prb, I, O, P, D); + CHECK(create_zp_precompute_conv_pd(pd->data->zp_pd, engine, + attr, pd->weights_md(), I, O, P, D, data_type::f32, + pd->get_prop_kind(), + !pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc)); + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + auto scratchpad = pd->scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested_multiple, + pd->data->zp_pd->scratchpad_registry()); } - memory::desc src(dims_src, memory::data_type::s8, - memory::format_tag::any); - memory::desc dst(dims_dst, memory::data_type::s32, - memory::format_tag::any); - - // create a nested conv and allocate a nested scratchpad for it - primitive_attr_t attr; - int mask = 0; - CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); - attr.zero_points_.set(DNNL_ARG_SRC, mask); - attr.post_ops_.append_eltwise( - 1.f, alg_kind_t::dnnl_eltwise_linear, -1.f, 0.f); - dnnl_primitive_desc *zp_pd; - CHECK(dnnl_convolution_forward_primitive_desc_create(&zp_pd, - engine, dnnl_prop_kind_t::dnnl_forward_inference, - dnnl_alg_kind_t::dnnl_convolution_direct, src.get(), w, - nullptr, dst.get(), S1.data() + off, D.data() + off, - P.data() + off, P1.data() + off, &attr)); - pd->data->zp_pd.reset(zp_pd, dnnl_primitive_desc_destroy); - auto scratchpad = pd->scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_nested_multiple, - pd->data->zp_pd->impl()->scratchpad_registry()); } - pd->data->tensor_cfg = get_tensor_config(pd->data->pd_cfg, - (pd->data->zp_pd) ? 
pd->data->zp_pd->impl()->src_md() - : nullptr); + pd->data->tensor_cfg = get_tensor_config( + pd->data->pd_cfg, zp_conv_md_in(*pd->data)); pd->data->kernel_infos.reserve(max_kernels); CHECK(init_kernel_infos(pd)); @@ -184,7 +136,7 @@ class gen_convolution_t { int max_tries = 100; conv_config_t cfg; layout_t zp_dst; - if (data.zp_pd) zp_dst = layout_t(data.zp_pd->impl()->dst_md(), false); + if (data.zp_pd) zp_dst = layout_t(zp_conv_md_out(data), false); if (primitive->cache_blob()) { tiler->set_cur_version(primitive->version()); @@ -206,8 +158,17 @@ class gen_convolution_t { ir_info() << cfg; init_nd_ranges(primitive, cfg); - auto &kernel_infos = data.kernel_infos; + + // This absolutely HAS to be executed first if present, + // since it adds its own version mark to the cache blob + for (int i = 0; i < int(kernel_infos.size()); i++) + if (kernel_infos[i].id() == kernel_id_t::zp_precalc) { + ir_assert(data.zp_pd); + CHECK(primitive->create_nested_primitive( + zp_prim_, data.zp_pd, engine)); + } + std::vector tmp_kernels; for (int i = 0; i < int(kernel_infos.size()); i++) { auto &info = kernel_infos[i]; @@ -256,10 +217,6 @@ class gen_convolution_t { break; case kernel_id_t::zp_precalc: - ir_assert(data.zp_pd); - if (!data.zp_prim) - CHECK(data.zp_pd->impl()->create_primitive( - data.zp_prim, engine)); tmp_kernels.emplace_back(); continue; @@ -327,12 +284,11 @@ class gen_convolution_t { new memory_t(ctx.stream()->engine(), md, std::move(s))); }; - ir_assert(data.zp_prim); + ir_assert(zp_prim_); std::unique_ptr zp_src, zp_dst; - CHECK(scratchpad_arg(zp_src, "src_zero_points", - data.zp_pd->impl()->src_md())); CHECK(scratchpad_arg( - zp_dst, "dst", data.zp_pd->impl()->dst_md())); + zp_src, "src_zero_points", zp_conv_md_in(data))); + CHECK(scratchpad_arg(zp_dst, "dst", zp_conv_md_out(data))); exec_args_t e_args; auto src_zp_idx = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; @@ -342,9 +298,9 @@ class gen_convolution_t { e_args[DNNL_ARG_DST] = memory_arg_t {zp_dst.get(), false}; exec_ctx_t e_ctx(ctx, std::move(e_args)); const auto nm = memory_tracking::names::key_nested_multiple; - nested_scratchpad_t ns(ctx, nm, data.zp_prim); + nested_scratchpad_t ns(ctx, nm, zp_prim_); e_ctx.set_scratchpad_grantor(ns.grantor()); - CHECK(data.zp_prim->execute(e_ctx)); + CHECK(zp_prim_->execute(e_ctx)); } nsubmitted++; if (nsubmitted == nkernels) break; @@ -355,6 +311,20 @@ class gen_convolution_t { } private: + static const memory_desc_t *zp_conv_md_in(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_dst_md() : data.zp_pd->src_md(); + } + + static const memory_desc_t *zp_conv_md_out(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_src_md() : data.zp_pd->dst_md(); + } + template static kernel_info_t &create_kernel_info(T *pd, kernel_id_t kernel_id) { auto &infos = pd->data->kernel_infos; @@ -369,10 +339,8 @@ class gen_convolution_t { static status_t init_kernel_infos(T *pd) { auto &data = *pd->data; auto &cfg = data.pd_cfg; - const bool needs_zp_precalc = cfg.zp_cfg().needs_src_precalc; - auto &conv_info = create_kernel_info(pd, kernel_id_t::convolution); - auto &zp_precalc_info = (needs_zp_precalc) + auto &zp_precalc_info = (cfg.zp_cfg().needs_src_conv_precalc) ? 
create_kernel_info(pd, kernel_id_t::zp_precalc) : conv_info; @@ -382,8 +350,10 @@ class gen_convolution_t { // Initialize kernel arguments. int scratchpad_key = memory_tracking::names::key_none; for (auto &t : data.tensor_cfg.tensors()) { - const bool src_zp_precalc - = needs_zp_precalc && (t.name == "src_zero_points"); + const bool wei_reorder_precalc = (t.name == "wei") + && cfg.zp_cfg().needs_src_reorder_precalc; + const bool src_conv_precalc = (t.name == "src_zero_points") + && cfg.zp_cfg().needs_src_conv_precalc; const auto compute_buf = make_buffer(t.name); size_t compute_size = t.compute_layout.size(); @@ -398,7 +368,7 @@ class gen_convolution_t { auto add_compute_arg = [&](kernel_info_t &ki, const expr_t &buf, bool is_input) { - if (t.needs_reorder || src_zp_precalc) + if (t.needs_reorder || src_conv_precalc) ki.register_scratchpad_arg( buf, compute_arg_key, is_input, compute_size); else @@ -419,12 +389,12 @@ class gen_convolution_t { return zero_out_info; }; - if (t.needs_reorder || src_zp_precalc) { + if (t.needs_reorder || src_conv_precalc) { int user_arg_key = compute_arg_key; auto user_buf = make_buffer(t.name + "_user"); compute_arg_key = ++scratchpad_key; - if (!src_zp_precalc && t.is_input) { + if (!src_conv_precalc && t.is_input) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::pre_reorder); reorder_info.register_user_arg(user_buf, user_arg_key, @@ -433,7 +403,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.user_layout, t.compute_layout)); } - if (!src_zp_precalc && t.is_output) { + if (!src_conv_precalc && t.is_output) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::post_reorder); add_compute_arg(reorder_info, compute_buf, true); @@ -442,7 +412,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.compute_layout, t.user_layout)); } - if (src_zp_precalc) { + if (src_conv_precalc) { scratchpad_book(++scratchpad_key); create_zero_out_info().register_scratchpad_arg(compute_buf, scratchpad_key, /*is_input=*/false, compute_size); @@ -464,6 +434,12 @@ class gen_convolution_t { add_compute_arg(zp_precalc_info, make_buffer("dst"), false); } scratchpad_book(compute_arg_key); + if (wei_reorder_precalc) { + // user-supplied weights contain precomputed ZP values, so + // the buffer is to be passed to the conv alongside weights + conv_info.register_user_arg( + user_buf, user_arg_key, t.is_input && !t.is_output); + } } if (t.needs_zero_out) { add_compute_arg(create_zero_out_info(), compute_buf, false); @@ -520,6 +496,7 @@ class gen_convolution_t { std::vector kernels_; std::vector nd_ranges_; + std::shared_ptr zp_prim_; }; status_t gen_convolution_fwd_t::pd_t::init(impl::engine_t *engine) { diff --git a/src/gpu/intel/jit/conv/normalization.cpp b/src/gpu/intel/jit/conv/normalization.cpp index 71dabfe229d..e1914e3eba3 100644 --- a/src/gpu/intel/jit/conv/normalization.cpp +++ b/src/gpu/intel/jit/conv/normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -175,9 +175,11 @@ void maybe_reshape_dims(dim_idx_t ndims, layout_t &layout, // this method only gets called when ZP precompute is in order; // in all other cases ZPs are applied ad-hoc, without a post-op view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { - auto map_o2k = [](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t KD, - dim_t P, dim_t S) { - const bool needs_right_bound = ((O - 1) * S + (KD - P) >= I); + auto map_o2k = [this](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t K, + dim_t D, dim_t P, dim_t S) { + const auto KD = (K - 1) * (D + 1) + 1; + const auto KDP = (KD > 1) ? KD - P : 0; + const bool needs_right_bound = (O - 1) * S + KDP >= I; expr_t o = v.vvars()[idx]; if (KD >= I) { o = o * S; @@ -186,7 +188,13 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { dim_t off = P; if (P > 0) l = binary_op_t::make(op_kind_t::_min, o * S - P, 0); if (needs_right_bound) { - r = binary_op_t::make(op_kind_t::_max, o * S + (KD - P), I); + if (schedule_.var_bound(o) > O) { + auto q = binary_op_t::make( + op_kind_t::_min, o * S + KDP, (O - 1) * S + KDP); + r = binary_op_t::make(op_kind_t::_max, q, I); + } else { + r = binary_op_t::make(op_kind_t::_max, o * S + KDP, I); + } off -= I; } o = (!l.is_empty()) ? l : o; @@ -218,9 +226,6 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { } dst = layout_t(dst.type(), dst.ndims(), dst.offset(), new_blk, false); - const auto KDD = (prb_.kd - 1) * (prb_.dd + 1) + 1; - const auto KDH = (prb_.kh - 1) * (prb_.dh + 1) + 1; - const auto KDW = (prb_.kw - 1) * (prb_.dw + 1) + 1; view_t view(vars, 6); view.set_vdim(vars[0], 1); // mb view.set_vdim(vars[1], prb_.g); @@ -228,9 +233,9 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { view.set_tdim(0, vars[0]); view.set_tdim(1, vars[1]); view.set_tdim(2, vars[2]); - map_o2k(view, 3, prb_.od, prb_.id, KDD, prb_.pd, prb_.sd); - map_o2k(view, 4, prb_.oh, prb_.ih, KDH, prb_.ph, prb_.sh); - map_o2k(view, 5, prb_.ow, prb_.iw, KDW, prb_.pw, prb_.sw); + map_o2k(view, 3, prb_.od, prb_.id, prb_.kd, prb_.dd, prb_.pd, prb_.sd); + map_o2k(view, 4, prb_.oh, prb_.ih, prb_.kh, prb_.dh, prb_.ph, prb_.sh); + map_o2k(view, 5, prb_.ow, prb_.iw, prb_.kw, prb_.dw, prb_.pw, prb_.sw); view.set_tlayout(dst); return view; } diff --git a/src/gpu/intel/jit/conv/normalization.hpp b/src/gpu/intel/jit/conv/normalization.hpp index 7e49a4c4a2c..cf926487376 100644 --- a/src/gpu/intel/jit/conv/normalization.hpp +++ b/src/gpu/intel/jit/conv/normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,7 +32,8 @@ class conv_post_op_view_mapper_t : public post_op_view_mapper_t { const conv_problem_t &prb, const zero_points_config_t &zp_cfg, const layout_t &zp_dst) : post_op_view_mapper_t(schedule.c_view()) - , has_external_src_zps_(zp_cfg.needs_src_precalc) + , has_external_src_zps_(zp_cfg.needs_src_conv_precalc + || zp_cfg.needs_src_reorder_precalc) , schedule_(schedule) , prb_(prb) , zp_dst_(zp_dst) {} diff --git a/src/gpu/intel/jit/conv/zp_plan.cpp b/src/gpu/intel/jit/conv/zp_plan.cpp index f65eefcb28f..13f7aae303a 100644 --- a/src/gpu/intel/jit/conv/zp_plan.cpp +++ b/src/gpu/intel/jit/conv/zp_plan.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1446,7 +1446,6 @@ class zp_comp_apply_plan_t : public base_plan_t { struct zp_plan_impl_t : public base_plan_t { bool src_2d_loads = false; - bool needs_precalc = false; bool has_dpasw = false; split_dispatcher_t sd; send_plan_t load; @@ -1526,8 +1525,9 @@ void zp_plan_t::init(const conv_config_t &cfg, bool src_2d_loads, const layout_t &wei_layout, const layout_t &dst_layout) { impl->src_2d_loads = src_2d_loads; impl->has_dpasw = cfg.fma_kind() == fma_kind_t::dpasw; - impl->needs_precalc = cfg.zp_cfg().needs_src_precalc; - bool do_src = cfg.zp_cfg().do_src_compensation && !impl->needs_precalc; + bool do_src = cfg.zp_cfg().do_src_compensation + && !cfg.zp_cfg().needs_src_reorder_precalc + && !cfg.zp_cfg().needs_src_conv_precalc; bool do_wei = cfg.zp_cfg().do_wei_compensation; send_plan_t impl_load; @@ -1574,10 +1574,6 @@ bool zp_plan_t::has_zp_wei() const { return impl->has_zp_wei(); } -bool zp_plan_t::needs_precalc() const { - return impl->needs_precalc; -} - int zp_plan_t::load_reg_buf_size() const { return impl->load.reg_buf_size(); } diff --git a/src/gpu/intel/jit/conv/zp_plan.hpp b/src/gpu/intel/jit/conv/zp_plan.hpp index 14ec03419fd..267de03422f 100644 --- a/src/gpu/intel/jit/conv/zp_plan.hpp +++ b/src/gpu/intel/jit/conv/zp_plan.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,7 +46,6 @@ struct zp_plan_t : public base_plan_t { bool is_src_precomp_compatible() const; bool has_zp_src() const; bool has_zp_wei() const; - bool needs_precalc() const; int load_reg_buf_size() const; int mask_reg_buf_size() const; int comp_reg_buf_size() const; diff --git a/src/gpu/intel/jit/ir/epilogue.cpp b/src/gpu/intel/jit/ir/epilogue.cpp index 516acc19d0a..53dcfc295e4 100644 --- a/src/gpu/intel/jit/ir/epilogue.cpp +++ b/src/gpu/intel/jit/ir/epilogue.cpp @@ -278,6 +278,9 @@ class post_op_tensor_t { stmt_t build_prefetch_stmt(const view_t &c_view) const { ir_assert(needs_load()); + // Disable prefetching for precomputed ZPs stored at the end of 'wei' + if ((mem_buf().str() == "wei") || (mem_buf().str() == "wei_user")) + return stmt_t(); auto prefetch = make_access_builder(*ir_ctx_, mem_view(), mem_buf(), expr_t(), send_op_t::prefetch, send_address_t::a64, get_cache_hint(c_view)); diff --git a/src/gpu/intel/jit/ir/kernel_info.hpp b/src/gpu/intel/jit/ir/kernel_info.hpp index 86390264760..56700b72bf9 100644 --- a/src/gpu/intel/jit/ir/kernel_info.hpp +++ b/src/gpu/intel/jit/ir/kernel_info.hpp @@ -144,11 +144,11 @@ class kernel_info_t { // Returns stage ID, kernels with smaller stage IDs are executed first. int stage_id() const { switch (id()) { - case kernel_id_t::pre_reorder: return 0; case kernel_id_t::zero_out: return 0; case kernel_id_t::zp_precalc: return 1; - case kernel_id_t::convolution: return 2; - case kernel_id_t::post_reorder: return 3; + case kernel_id_t::pre_reorder: return 2; + case kernel_id_t::convolution: return 3; + case kernel_id_t::post_reorder: return 4; default: ir_error_not_expected(); } return -1; diff --git a/src/gpu/intel/jit/ir/post_ops.cpp b/src/gpu/intel/jit/ir/post_ops.cpp index 81429f8c34a..4ca7837763a 100644 --- a/src/gpu/intel/jit/ir/post_ops.cpp +++ b/src/gpu/intel/jit/ir/post_ops.cpp @@ -108,12 +108,27 @@ post_op_context_t::post_op_context_t(const primitive_attr_t &attr, if (po_vm_.can_use_simple_src_zps() && zp_cfg.do_src_compensation) { if (zp_cfg.is_runtime_src_zero_points) { - bool per_oc = !zp_cfg.is_common_src_zero_point - || zp_cfg.needs_src_precalc; - auto view = po_vm_.create_src_zp_view((per_oc) ? 1 << 1 : 0); + auto view = po_vm_.create_src_zp_view( + (!zp_cfg.is_common_src_zero_point) ? 
1 << 1 : 0); auto buf = kernel_info.find_arg("src_zero_points"); - auto in = add_input_tensor(view, buf); - post_ops_.emplace_back(c, c - in); + if (zp_cfg.needs_src_reorder_precalc) { + auto wei = kernel_info.find_arg("wei_user", true); + if (wei.is_empty()) wei = kernel_info.find_arg("wei"); + + layout_t tlayout(view.tlayout()); + tlayout.set_offset( + utils::div_up(schedule.b_view().tlayout().size(), + tlayout.type().size())); + view.set_tlayout(tlayout); + layout_t scalar(zp_cfg.src_zp_type, 0, + std::vector(view.vvars().size(), 1), false); + auto zp = add_input_tensor(view_t(scalar, view.vvars()), buf); + auto in = add_input_tensor(view, wei); + post_ops_.emplace_back(c, c - in * zp); + } else { + auto in = add_input_tensor(view, buf); + post_ops_.emplace_back(c, c - in); + } } else { auto func = eltwise_t::make(alg_kind::eltwise_linear, /*scale=*/1.f, diff --git a/src/gpu/intel/jit/ir/post_ops.hpp b/src/gpu/intel/jit/ir/post_ops.hpp index 0d220bf8cf0..60cf6b1dae3 100644 --- a/src/gpu/intel/jit/ir/post_ops.hpp +++ b/src/gpu/intel/jit/ir/post_ops.hpp @@ -46,7 +46,8 @@ struct zero_points_config_t { bool is_common_src_zero_point = false; bool is_common_wei_zero_point = false; bool is_common_dst_zero_point = false; - bool needs_src_precalc = false; + bool needs_src_reorder_precalc = false; + bool needs_src_conv_precalc = false; int common_src_zero_point = 0; int common_wei_zero_point = 0; int common_dst_zero_point = 0; @@ -75,8 +76,10 @@ struct zero_points_config_t { pd && pd->attr()->zero_points_.common(DNNL_ARG_WEIGHTS)) , is_common_dst_zero_point( pd && pd->attr()->zero_points_.common(DNNL_ARG_DST)) - , needs_src_precalc( - pd && do_src_compensation && is_src_precalc_compatible(pd)) + , needs_src_reorder_precalc( + pd && do_src_compensation && can_use_src_reorder_precalc(pd)) + , needs_src_conv_precalc(pd && do_src_compensation + && !needs_src_reorder_precalc && can_use_src_conv_precalc(pd)) , common_src_zero_point(0) , common_wei_zero_point(0) , common_dst_zero_point(0) { @@ -102,12 +105,22 @@ struct zero_points_config_t { } private: - bool is_src_precalc_compatible(const primitive_desc_t *pd) { + bool can_use_src_reorder_precalc(const primitive_desc_t *pd) { if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; - // In general, precomputed ZPs are slower than the regular ZPs up to a - // point where a nested convolution that does the precalc takes less - // time than the in-situ compensations; that usually happens around - // MB = 64, but the exact number is just a heuristic. + // Reorder-based precomputed ZPs are only available if the user did not + // specify the weights mem desc so the convolution can choose it freely + // and set a mem desc flag asking a reorder to precompute the values. + return (pd->invariant_wei_md()->format_kind == format_kind::any) + && pd->attr()->zero_points_.common(DNNL_ARG_SRC) + && pd->attr()->zero_points_.has_default_values( + DNNL_ARG_WEIGHTS); + } + bool can_use_src_conv_precalc(const primitive_desc_t *pd) { + if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; + // In general, conv-based precomputed ZPs are slower than the regular + // ZPs up to a point where a nested convolution that does the precalc + // takes less time than the in-situ compensations; that usually happens + // around MB = 64, but the exact number is just a heuristic. 
// TODO: a finer-grained estimate return (pd->invariant_src_md()->dims[0] >= 64) && pd->attr()->zero_points_.has_default_values( diff --git a/src/gpu/intel/jit/ir/tensor_config.cpp b/src/gpu/intel/jit/ir/tensor_config.cpp index 20b8765df2b..a7c5e4f7c8d 100644 --- a/src/gpu/intel/jit/ir/tensor_config.cpp +++ b/src/gpu/intel/jit/ir/tensor_config.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,14 +38,14 @@ void init_extra_tensors(const zero_points_config_t &zp_cfg, /*is_input=*/true, /*is_output=*/false, zp_layout); }; if (zp_cfg.do_src_compensation && zp_cfg.is_runtime_src_zero_points) { - if (!zp_cfg.needs_src_precalc) { - add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, - (zp_cfg.is_common_src_zero_point) ? 1 : ic); - } else { + if (zp_cfg.needs_src_conv_precalc) { ir_assert(zp_src); int arg_key = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; tensor_cfg.add_tensor("src_zero_points", arg_key, /*is_input=*/true, /*is_output=*/false, layout_t(zp_src, false), layout_t()); + } else { + add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, + (zp_cfg.is_common_src_zero_point) ? 1 : ic); } } if (zp_cfg.do_wei_compensation && zp_cfg.is_runtime_wei_zero_points) { diff --git a/src/gpu/intel/jit/reorder/gen_reorder.cpp b/src/gpu/intel/jit/reorder/gen_reorder.cpp index 5f048447146..974b35210a8 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.cpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -98,7 +98,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, | sm::rounding_mode; VDISPATCH_REORDER( attr()->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); VDISPATCH_REORDER(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_REORDER(scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); @@ -148,6 +148,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, check_layout(dst_layout), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); VDISPATCH_REORDER(compute_engine->mayiuse_ngen_kernels(), VERBOSE_UNSUPPORTED_DEVICE_FEATURE, "ngen_kernels"); + auto *gpu_attr = utils::downcast(attr()->gpu_attr_.get()); hw_t hw(engine); @@ -158,7 +159,8 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, cfg->set_zp_cfg(zp_cfg); VDISPATCH_REORDER_SC( init_kernel_info(), "kernel initialization unsuccessful"); - + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); return status::success; } @@ -202,6 +204,9 @@ status_t gen_reorder_t::pd_t::init_kernel_info() { } status_t gen_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + auto &cfg = *pd()->cfg; auto &info = *pd()->kernel_info; @@ -221,6 +226,7 @@ status_t gen_reorder_t::execute(const exec_ctx_t &ctx) const { info.set_args(arg_list, storage_list); CHECK(parallel_for(ctx, info.nd_range(), kernel_, arg_list)); + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); return status::success; } diff --git a/src/gpu/intel/jit/reorder/gen_reorder.hpp b/src/gpu/intel/jit/reorder/gen_reorder.hpp index c6aa048dfb3..478d5e030a4 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.hpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ class gen_reorder_t : public gpu_primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } compute::kernel_t kernel_; + std::shared_ptr zp_precomp_conv_; }; } // namespace jit diff --git a/src/gpu/intel/ocl/ref_reorder.cpp b/src/gpu/intel/ocl/ref_reorder.cpp index e058b7091e4..edebcb3d305 100644 --- a/src/gpu/intel/ocl/ref_reorder.cpp +++ b/src/gpu/intel/ocl/ref_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -150,17 +150,19 @@ status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const { CHECK(large_parallel_for( ctx, nd_range, kernels_[0], arg_list, arg_list.nargs())); - if (!conf.subbyte_pack) return status::success; - - compute::kernel_arg_list_t repack_arg_list; - repack_arg_list.set(0, *tmp); - repack_arg_list.set(1, dst); - repack_arg_list.set(2, into(conf.nelems)); - repack_arg_list.set(3, 4); - compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); - compute::nd_range_t repack_nd_range(repack_gws); - return large_parallel_for( - ctx, repack_nd_range, kernels_[1], repack_arg_list, 4); + if (conf.subbyte_pack) { + compute::kernel_arg_list_t repack_arg_list; + repack_arg_list.set(0, *tmp); + repack_arg_list.set(1, dst); + repack_arg_list.set(2, into(conf.nelems)); + repack_arg_list.set(3, 4); + compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); + compute::nd_range_t repack_nd_range(repack_gws); + CHECK(large_parallel_for( + ctx, repack_nd_range, kernels_[1], repack_arg_list, 4)); + } + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); + return status::success; } } // namespace ocl diff --git a/src/gpu/intel/ocl/ref_reorder.hpp b/src/gpu/intel/ocl/ref_reorder.hpp index 7b312fad53c..7feed0402f7 100644 --- a/src/gpu/intel/ocl/ref_reorder.hpp +++ b/src/gpu/intel/ocl/ref_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,8 +121,10 @@ struct ref_reorder_t : public gpu_primitive_t { VERBOSE_UNSUPPORTED_DT_CFG); VDISPATCH_REORDER_SC(init_conf(engine), "init_conf()"); - init_scratchpad(); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad(); return status::success; } @@ -137,6 +139,9 @@ struct ref_reorder_t : public gpu_primitive_t { }; status_t init(impl::engine_t *engine) override { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + compute::kernel_ctx_t kernel_ctx; auto status = pd()->init_kernel_ctx(kernel_ctx); @@ -161,6 +166,7 @@ struct ref_reorder_t : public gpu_primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::vector kernels_; + std::shared_ptr zp_precomp_conv_; }; } // namespace ocl From 90cea8472e9f85adcbb310fedf05ae6477640477 Mon Sep 17 00:00:00 2001 From: "Wang, Zhitao" Date: Tue, 24 Dec 2024 05:30:35 +0000 Subject: [PATCH 04/40] doc: graph: add document for sdpa with compressed key and value --- doc/graph/fusion_patterns/fusion_patterns.md | 1 + .../images/compressed_sdpa_pattern.png | Bin 0 -> 199629 bytes .../sdpa_with_compressed_kv.md | 119 ++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png create mode 100644 doc/graph/fusion_patterns/sdpa_with_compressed_kv.md diff --git a/doc/graph/fusion_patterns/fusion_patterns.md b/doc/graph/fusion_patterns/fusion_patterns.md index b39374e1571..b76446667df 100644 --- a/doc/graph/fusion_patterns/fusion_patterns.md +++ b/doc/graph/fusion_patterns/fusion_patterns.md @@ -73,6 +73,7 @@ ReduceProd | ReduceSum] |:--------|:-----------------------------| | Scaled Dot-Product Attention | Refer to @ref dev_guide_graph_sdpa for more details. 
| | Grouped Query Attention | Refer to @ref dev_guide_graph_gqa for more details. | +| Scaled Dot-Product Attention with Compressed Key/Value | Refer to @ref dev_guide_graph_sdpa_compressed_kv for more details. | | Gated Multi-Layer Perceptron (Gated-MLP) | Refer to @ref dev_guide_graph_gated_mlp for more details. | | Convolution + BiasAdd\f$^?\f$ + BatchNormInference\f$^?\f$ + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Convolution Neural Networks, for example ResNet, ResNext, SSD, etc. | | ConvTranspose + BiasAdd\f$^?\f$ + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Generative Adversarial Networks. | diff --git a/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png b/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png new file mode 100644 index 0000000000000000000000000000000000000000..b0563e7fb0e5b497154ac412c3aa312876d0dd62 GIT binary patch literal 199629 zcmeEP2V4_b6GzW>mJ>x0D>md%ffNW}Lpmx&Kn3K43nV}^5J-YF0kHyBKD(mWv7U;( zVFMMrhrNrv9~O!fzqiT0WZ7Mcl)z&A@w-blo4nn9Gyj?S&%Al@qaAG8wCvJSMMb5} z2wO{M6_wviRa6@N-l7>ea^%g5d*FYKM9wzmD*s&R_C}@gR&BABn>ZlMm&fO-XfZ9M zue2C6e}PD>#kAC7FxWvsdK@0xPsk1s=>>Ad;1Kwn?Z*v7Pp}t6@c4YT7Q@<5j|M*a z)0G#<5rm4sONTMwKXe*6%rpnDzy~Y?>B}&dfetug78Jzix^lfo@xbp|8#0abm@IJk z&r!Bx?CrD|7T`6XH;D`WXT$ZGBtX7m;V%>ffFss)CS4Ewv}8;dwl7=A!~6ttjgz@T z5l;{(y$yzeo{=8iNP6`_;POS%<8-F4p%5W1n!HW#kA6*nStL$ z{+Jf%GXZY`+2DF4&&oGQSHuYdj{+H<$B`Zsb9rGQBiuusc)`{#p_51Xx=oJ2{~Y|$ zfG`^&JILQ&z~S=2PjJG}E20@0N`I6SEw zU3na_zw}&V24+HRxja9AI5M4OBE6shHar=<6Olg~NSE~cglUmJNBA^_0s;6v%&cKQ zh9?DtS#bHt!-mfh9R~>0&wp3;G=wtssLszh_|hhZ7#Rf`1v|M{TMaP)T z=7*q<+@3AA4*|D;NS#O=4&_)V2npnHk$Iv6r3>}viMcL8Y#-!Hp+L&OF@JFYAN?30 z0N+Bu7f1!3N%v;cxj^L2MPi{~5*PCcjmEMv1CPesR}d)19QT#{LyxNKHvogqX0P z2$Ux#^$wQ8!a}*0CN|C@7auN*#j^JgjToVe8g2}Na5=!@qbe>Civ0zCf?l!HL;IKj2nNevR}ziR)LsF+t=C+z>D+7+Wah^4VhEKL~efVsVi04z<1`9VPpeF+Sn)u9;>uRQkhGTUkDDrSU8(0lVFHn(&twN z{3>%uBYNrhkI6*e!yzdPnefKyR*69Lk2>*)9hbTA(l9{cw#$6~N;&bhseGtXRj7Pu zpxw}v5780CCqSJChLMRLB(I?|)V%)TRi12A>Z%~*N9BxfHSJhgv2Dd_QB_5hUN8v> zSD`IK)ucjoLc_@hrcOrC`%7 zDUGE6=mtT+*Z^7?eh!ZL$TBDLgD{7JIZsN_J7I`Y>7df%1f>Sq5M+D|0ajXZ0|kK+ z<{Oi`K@U})HYZC9m3De1`5PQ{Unw^gq3UZWpTmG|2vh?m8p>fDPXpBOU>#4q8&(g} zIjDP3gZ`AvHn7tz$QUTAhSn3*d%5Pq-jYc)WA^6N|^x91O3CL zgOCUr=nxC~au7MFD-%RImw*x6pX2xHk11_naSV$krvaeg)xMP^8x?Yq)Hjr9Lamf1 zso+q?F+>~`0-`4AupnKdjAvx(U|$R&i1OJXos#E6#;r3f4HLh}ysO`#z# zsff_gE62ELh;?&OOs&w*IjsIo^zK0#0w1`#BqR-0`D z6(MX;Ku?1vGzh7W5+UPO{)v@+2L@<5kK_(2hF0m%V*iV{5BRC6u^}WSSklK)nOjFI zwq!M^<$tGI^G0H_?OM|M8P)dUb@VUG{a%938JOdh$s-cKDRahPKQ^=$=c$Ewp z$i)i64eCc3kkq_T!{EUi!O%C*W>v*49|ymzGe^>|nIxl930VXs(28zQ9!~l{}DTV=CY*cDt0QL@Q2?MG=1l2GA)B@m^0Q_Xhx3cP^Ou`%ClJKPkFjWFcNdpr-CfW;3 zaejaS$*2^=0L&fK76w##2&!QKS(?6DpinQ%x-eWVT*H9PbHihb@dX$N^Ng*-g17)d zpxWw4fF4=GAJ&!NhpaLOA(3}XKO04tEg95lGsT~&ca#@M>(OWirF*fkj9Bo1ysuiR zZY+{-kLC(_pj)q`vo^9sxh!3iNLR)6k(H>OMyaX_Yo{gQMEP?s&9~rkES1(c$yb<>JaJTU`=H?;e{JfdAKK@)gff3qdTHf(o z$dz_UCrxxpo|F7S- z-$g@gY~z;d{G6fKe1whpsPWF@$Bead2nu!@8Eyhw$(G6v!DdAS&u6# z%f}jg*%zV7RcK`!=}e;1f|*GdF=#>SGe*b`6a@)HVBJ>@PHZi<9aqT2P76F%X?LJ9 zmmtLtvOpY4280c`VFpBlns>nIc!4!Sg%b`*1qYlIw}^{a>r8&-9yWBQ-H~2C^430O z_}0=3u2}9FG^`&?`ZTJ#hOl1g2NRl6s2_M?t$F=mLSQayNXqI5kzkS9b!RTvjVDNH z)se+w=!!~$5Kz~ulPy!gxo?WH>a~rEl`+M*sJ~vJodMo7gLW&Z(n8~0^LS?nMH}KB zaZt)49&ufTvz+OupyO1HH($i;hlIqiF1Tiln3$F(`3F1))FH3c$dv)B&*A(-lZ#Oe zXcDoGGAsv&l3Dlh3vt=QpAOR92W_SQbCLef5^O+-LqKA~v6`NK%=G?b6jZy5s9?MU zXCFckX7)+9SgU{24;xWo;#WBT=)|L9P$%k)86$I!Gw9B+6}HOYcBy=V~@qY&G7V?#c>t8o>%@i@Z%pwtXe|A`Li8!hpUd 
z3N^AY8v`}60s*YES+`jd(wPL{fZzkh+mP&{iE}KgGno|6jNrLD133AZ$*zBy5z@J? zN6fNR>MCZ4sm_Jq8&etJ1CE+!LI?tjACd|@2&w!M+inxAj-oigUqyaF?Y8t*QB$V? z`89+zO<6-yR({dSI*hHYWZU?fW^u4Prqm}APqtHCIgS;S4y<2=M>vI&c%)E8CkFA= z#435#L7skfhuc3&DjSick9?mMi1-{43IHb|BGY)g@&=%AKe~XwSuj3gP3Xp1JiAd=p#fqTh z8?!YckO3bK*W1{eOa_=NrU_Fi10eF0{2@6Bi^`DF#LPJwykZvR*DkzEWv+Hp-7*3a zDlH7rF-l7flMF8$6A8XoDs_}5jTpf^q#|PxB{@2P>89& z#BX_6pS))UrKHFDnmKs;+k`oW>56@sM$GYJT)d+jIXD4qnhB7NHRV>uT$ig{{Ur-OT(xfv< zLea?Jp;W=EGBGXNeMaUNV&=R4C8kNIn@G(_B^l$lD-(N?Ew-?cc4b&?c?4Ba?;J?< z!I-}0Ddh%{m?o89WnvoWE2IV$v*c8nn3hl?&WYo**KV;BRB(*}VQ%ecd1iUC)=xGNyvj33ef(qYgap+dfyRO5)b zC}p}Cq(cCyj}d>k1uXPJ(HOQq5oU6=#U&$1XPuaj@(~0|7G0EzUu&1z#+KvAH-fJ0 zez3Kh#-PI$TIEpU5cRM~2qgsK0yZK-rl=-*n*H2V6VpzF=oRLYf8}Jetff#n6JGC< z&7`^wlQ)>WRUOYbRg%pONCt=no?>ALvHM8v0=!i0Y8R*(5OJa2BIt+|$YESo=&aRm zpeCvO^JVi%8o@PyNrN4R;G{v!qTZwt$zf%}xh(&=J|~=i&Z(l;IH#&4DjFbBfe#{J zoehi*Y9Wh(8?xaL6?7sETnFV?^*Lr`;^y9v#H!X+%&MSzp3yX1DMmKlb~2(alr`%T;d2wSXrlciSS_7 zPAWoJ@CJ-jSYUjQ4d&~gun@ySY0jElRwPYLtXd!Z^o<-Ihm)sKInuS}JqAHZRRoEm{da)I^+bX?*8M%`# zv8ZK5qSAO2Mx~`nJTUPH3|X*q4v#6O!UY8MT^BtKm^H_Uo#DX>h zwJWk!O>juc>JaqH%Bw7?eI=4-$^gqEdE<^WlalwHDr4K9dZBzMiBzHTN%#ZGD<8C) z(8B_|UevxB!>c@lWK`;^AhEEg?p!^jubUg9dq1a$HR$f4G+Tx0)F8YaMUrk-2tWVe=^uriRrb6|@7*~MEFvbOH)YT7S5`$9uQB_B=2JI6n3KiNX z2~S9McaJU$u22OuFf>qrzei`{?4p5TgV8`@eyExnSo=W`<)s*3rOGfi&|{$tD-7Xl zo_wPd!WJ;B3F@K1VX13MP>n2?IpFwlVP%4d>lXmv+e&^2MEV>vp->PC;td}@TLhX} zf~W)2@6snkI2rW$l13x)iW_Nc!a;ME#^5_5@D#RoukN z8>I@3jbtt;uckDdwU<8vMZ^0Vr1#A(LO3 z3B%_%@Lg5#`+dlsS;Y$zOaC-DUwWmLxhhHRCnnia{&&!`tJXh)l*pzNv#>C&iqP)h zY0Jy6SivCz*c4t@B&mM_dz8`W20>sAFxV;-eh!ZL2>1dazzOU@CQ9gI&XZPyAw{R) zJ5F$5@BpP_OFt%9b>bSI(v~|=5QuNwiyo?c-~3R2o|x+r#P$KdWEl#Y1cN&)4V;zB zO4RNGFPw8nRucxb5}UAtQ5jUEy00Ik*L(^}M!=P}Trxhc#w}M3TA>D!UNV3N08$Qc z`z~qiP(k33RklHOjnjyU0aAh8?ai9)BL*NVIV=ZhM*TrYOwHU+=5lKn zHStm+t0=(KIlw$+{f12jG2>Cm_D^+XG3%>Lupm;P$U$M5K;zVWf(VL-0MNq^6m|7-76M51XnylKll4PSoyraIMBqQdf^`43jXdKZk ziDcEL%BTh#DTCK#RG-S4mO-5`r0LPH|Ij(6p1me=)|TAh5O$y#iKg+MoE(J_x{WL% z#ZY{>ElOk#S7}2JRxwdxX7z36kV$_^;Ib=PY zb23&93EwbrT1A|_T>B9}i=f6b1+R#Qs!c+j>i!o4CvviEDRhHso}$1Pk~3*?7Z8q0 zU6$e(S!_Xdxrd}M^sitOFerdEFZKk3-jLQ$Fo-Qf3C^4n?qO(Mci}cp>=l6ac(HF- zXirSqC_P&;&(;Jlc+g{|o@WbFhc(A8D%P`=bD6-Ymb`IC%3|a>!ZOl=QarEB8nX&Z zOc)ZiX<#k{sFu)!k*?yr2w9B=9Fnr0Gn1IfQDSLju#~|<`4tLcY0YP9s<6Pk>iAn( zOr$#0(edSU3aWa=JC9_;QWx-uRJfCp2q)}AOISfeVYnj_;nj?it3qum;SSKy@;@X4 zhpU7&fj&NDhZIudkB^ZW7TDEzK0qn@K!%~$k4^Gb#@Ub;hTue>PUy0TU$9sU%%}lD z-%y~0w?|zYPfQ0;&Bj;0VjRiCS+fIppsOj<(w57g%oX!|*nnk_8oAmnX`~Z`h{ApX z8Jk`=6+EMqOA;`?0@iORyLbmJREEw2NfIBjq1Z$K#6XC7SV%|4t<}B3NLR)%iR?C% z21OT=je`R#SH|FEu(D#vBw^gCA%^h6OA+o0MnkY6fKmc~8VDisK_p}niAsuyAd#Df z;-D}jREl8P@>ezp;zbZfGZ?O%A|gmel`B!O{;yEQZw}3XrAE{f!FsT;K#77?KtTgM z3Q|`Di8&LcA*RAQeHILFp)W{X5hP3EC=GG_H^fHJ(WAZy5?N)1qoA~9iB#^Dq)cT= z857)c3ko}f^?;*RI)hbk2I(|dhE9DPtOo?OQU+^624z7q<$_|UOiFe-fXCs04KK;p zX_sEhS<#A3g(Qq+4NfSJgN90an$PnK1XTxAlR z59!qkV8GpXyyZxT5H=^=f0B!ZV3f0OfHQ&)#3Cn2Ck@c1(BEFb;euV3X&kt!Ub!nb z1Bp*S0)Zy-XLBS?a=}~4A>wnChB8cxs_n|H>!mv))XdPvEhxlpa=5#RGedWTE;@K+ zY8V2-B&|K=0zmqMDBcCJgLnrB@lsxjXqy-!Pl6SPhFZ!2B zB*x)0MyCr~6$6>;I;n~weGtSXh>LQgZw9SGU96@65iBKyG>ibrmW$MH2_oh&l)HbI zkK@&p9YZK9(9?(SF|q@JrhbbKG563AcLXQJ-D=7J3kK@t$N)7A5OF01wR{jo6_nS8 z6wzt;JSGX0P(zG}IV9!g+<)r7KQ6{#J2I+Z{P^1bm8OH5_GKG~FcuX-J zfQ?~&H)@!futXDvd^Nw|%osX*a7b8~42Pt`dNWdqfa&E@*$9PeR#jWf)auG)^5=!p zVuBE%WSDZSuw8$glj4tncS7%zvcMQM==bxaZiQ(ebQVEv;)b<6~3Ybm)KExC{p znfYoem||VZ`l^%-LfPoDut=fWMl3i*={xGXnZzWBQrpTgZIv~FD79Ci36yX*R5yWi z79=ZBtu$Z}8-dq+@JBbqnO*}}7fyY(BxaVXsg~97GON&tuLeC#8)1kg$HiA*Wytck 
z4d&2!3=^gij?ft_7vR?-(a*Itj<5ZfHi#sS%Y%zSFd&a`S1xf+Nnc zB?8;X&K%WL+X}56Y89~oB)NJw3}=;w3(}Z|IJdDj)Ej4tnXq|MwbdK>VPu=D+TFSu z-=9?07P;b@GwB7w%LM1w%dM-)gI;1PxQ^C={gPW(lg|l)hE;IQ(PCAcZ_0pFEGA_G z-BoPL24q^>hcTvhq&ZgO3=ZdDy$@m(V65KS=v3;nNp4zR=`ieGi->)H?$Lns&! z*5G@}2@2xcA|(JdSQ8K^7KXba(Vu+R7HaZYv0I{j4E5SL3umNAj`)qxQUCjg9h!F-6Oj;P@8UmcMV}= zgwbef*mvmD)*NhC9e%5l3$LS$Ha6%YWeMpGf+d9b46&R9cAM# zs_$P~N6Pk#Q1UoZPBAuzjQESf~%cpV1kocG(LoN zS0uwyS0N}~y!cBigfY%KAPNE6S;_yQMLD&n940VH1f9VOu7J9FqAWXs?wCUw=a{*)n%gY^%; zP)pj{;L&h4jS}q{Ng+$h_LNF~V0RU_hVsJUO$IOhKr<=xG2ua63${oMKLVnJ`BaJ% zBKfi{d1pg9_;RQ}u^YR;IDijO75Uhg$LCwfSDDkj*>o-_7cnOUBQzSz$_#AzC0T!t z`9|q=>F8BRei>}O=Ta}$sl$8|#5w4lU>H~!Ltt|%P~;#4E3|}#Tz2XYuKkMJ@F@V- z7$iwucnB+-kg=#ovRUKhRRjop^HnG6yQ)jV~ooVfM~SV-CI2B zn@}@5KyC<#Rar_W0%))_RICl0CM266DpSshAgwe&%5g9*iEkbR3Vt? zF&QTEe<+|pf1>8IwS<1F%<`{*3giPRHY2o6%~rQ5^$T6H6A)&@6$EfWtsj6Z4DdoC z3+o}nV~VMWBj}egkZy%??gag+;0ma#UqsGbPLPn%Cr%;kKruo@Fl3uUDd`i5MoHJ0 z$^J{;D4WnU0`?}uC?GZH69#cLy3%W|Rj-+l2&As;kWp`SK%_w3DtCWbVP{bumlu^W5 z6A5LdpI4g<7oiHF9vSOe-uek{?&Kh}H6Ozq?IW^fnUD9g^0nc*jF4JQ`Q}g@ZZJr| zihXn?MK45ar`-0*AC*!%orNw54PcA?rQ9sa``7xmPv|&crYh)tfJ_%|hbNM1rt19M zxG{l2Atpf~77=zl40V!aGmOJ>mMfi-+b>@e=`gpsFbm_FQHWnm0f^{BiaBgLZWdhTx zn5viR&N(6z$m5_=feSmx$T{3$vZd=J5u0N$+QHJnZ6e8xLli?&{-ttMtl$s8)L2?T0{Xc8u1l^&?Wje)YeJ5-60K{kZyO9H^_sRys15I z2z86FOYBG=VO7x#W1*lpVPyp#Q%ph!#{x0%l3MdDkjwx>9l{Sug^UfUKoAo)3P*Kc zCeUh`D=jr9&`n149i!f2ZWv-$Fg3EJy@EhNAhM@8ws00bRQY|x$cl31&MBhD6_vQ5 zFvS&u%auk}sCr`8U{qBT(O)DaHKA{b(Nt=nIx+E6=co>09D#I{x<^`=RsiaH#!O=b z@)4Am83W{v{EL!zrOd%-u8;?^=3F6aBc%6UZXN&?CM@M&wrUeO5TfURa~m)c1aZCY zu%002c;u~luqR~e@k7=lu&-tAjf7c4oiNDCh1k-aMg?1l$GCZoHem|6E?y%gXAtF1 zB4x@g*MWl9CHLkw)**ml={0H$Z&ZL4gX_X(vANaSa-H-CN$efzdDWrr#`4)d?p|Z; zxMr4N_O1aFt*wo{b<1{VFnpXJ*5AM!hh7NDAtiMuU-CpS&vNewF`Q)n6UJe~`es@T zrlP1jQt_*%4^|mIj{!M8tO=Ec6Lk_>lRgR*)-kbl1LgBb@4mW%Kt+;|^g)!3pGuMs z*=sfwI4}U?e@gV7RR8uaY`wm$PD#VDS&!Ha1i1mf}BYj?||AnOkNzaq0JJD6A zR*0yi1G#*#WZX)Nh9p_Y|B&Ddmq{lT2x3hbbcz**|Gy?3NIDzUgnv3LCZjz5*QWyC zDr{9Ip#oM~uMS2okC*?)Z`@)2G@_J1W`eAb{a^Lpn zqxrfn)~Hc!fSHy`c+n7Ou|Ylz{OcsfClvzJE5rza!i~GPFCAdHwdB9QT*|LDq*Jzc z8I^22sdN$O90X@*1o zI*B@%j&l(R9gMsMVLt2;LPJYmUkiU{Td8EqQ+Z+^fWX1Z5_z=h-B{YFl0ge%`*Cf# zJU?ZLJfIU8G$>L=cH@GX;kw>IjsYSL%_eB9IK-x$Vo*%-%nu-d%*bPo!=ogJlzBFa z=h>=DSwM;!H6#=%ktfv=qC+F^cF=%guIkP>-iSj9z&JSdF1#Y0?|P6s`!%`KYD!=w z;`@46Nl7Yz$}Ab6Sb?{T@*IR}+tSU=NgtsyAx54*!se9_&T+H=ti=JRq8yp5pasT% zR3uJG1)?4##FVR^u+m3-Eh#LO_a1|^ExxNBiC;ktSZ^@NuR1J{FI%_E&zno74JTn3 z&lLI&SmGZZQ_Ny0Wr4h;uA!`HtmOiENRpWIY(fJQNRdiQD_;wn>IWjj#_gp#>RE7SL>5T+IyKgTWR|h{njv#C?Nw{es!ZQR2mxnXF>a zj8GbcgIYH_9oFOc31PNOBFqdLasb_h8Vm#T^?o5d4p&r)N)QVKd@(Pm^g$m%U?A5A zwlkMrJzFRggqA+dSHK7TpG%bm`5~PgkP)R{Lh8O^ZUOyxC)RYhiKgmK=W%qU}!Tb7-{ zx`VLo0gM_H+7p~~y1}F(^NO*;Qng~xyuqe`gnS$j1fbM>@R7A8Z!%DNcn-cRehmr> zn`X#hApPJ=_YDP&s92VCYmfpV@9t?K>r&11mHdO}NqSpb(hb1-%6Aw=oMUTt01wc9`@zV%iJ;!e+>*GOz$s#Pu5%h&cnL*CE@P z7xg+!Am><~6`;m+lM(a;U&BSr1~%523-&e&QiPvTNs7`1wdHcDUPC`U{*13e4uoWo-Zs+oqd6BJE2JdCAk zzE=Sg4grURmjH*Xa<2mTxXpw@Hi)HY@CSpCB|0KRP%ATC*a<|Y4!6LEiF}TD$MEuD zG6J~NwetJ$@%-HHOXxG{gQ(0+OHRjZTwnfo(6h@eH;0o^IevsJe}W4^eidAiIBn%W zfBCbuk+1ftU$}yEqGb(Apib64t`LT#7OogThv4V*UM?CH;Fj$1{M_$T3|DGA9foXU z8Hp6rjhRv4BC5A8hIpbUvC8}!Y(0ZYK5~X6$ zDpbA(QT0n8OpQ(Sj8G~HDnreif4s^wNk*lv3Zm#;b1GPMwWz8h)}Spz(;XG6Q-cW6 zjUoQRfRjY9YhIlQEI8hRQdgHyFs?aWs){+IwEm%aqzd(~L4>jq0Ur=`X@;UQJqSn)A1QPg(x5~YtwKvkXQ#$O%1HNHdWRuq`VZ%syd+? 
zXM%E5wHFf68FlAIAX&t!QRh#PtgRv5S@b5ZIuY15zY{ zP(X+I@Yx~}&j*yAVRmTr2@y`_A~mx}HJrR826+d*iV}6k;GGCOMKR{RJVw$pl$>12 zjgYIZM^*7bE312zATH3ksE~zZND@8NSLV_=KtmX!L8YpNcnqc)Yn)z$8G~BQnwr2Y zCVKgX$f!tyuR&)YkR3jdn=A!U(uy2~EemtPRCr1UW%+@n0%(OThQbwd!;s?`_$!S@ zfr3D=F2EeSYb|=H^7X!CJK~nc)yic=Cf90VFiuuwYjrRgR!H_^D>tB<3PGUm%h2g$ zd7BjS%-q#`;8{v)y2ok5?647C4fB??4MBE+ZU1?Ja7TMH72|$H(kc9ZAy6FCT=S zBs9A^W!A*M3?CzYsny+(qRP}Xe2<-i!J zkS-@fVi~1OkUDeZmedH!02=l(;d{!NQY4N=rXM2{1dOPNPAeVhmBo?Cjb~k%QmUIs z7NB`y!3`8Vi1f8*CCNsGh7*oTT|-Hhv{Y~;;}{|i3K1*AnV$`3=nUwuL1kdc(;(CX zVq)?Rx1VaQZ^C=3Y zftN4+$4+o%d<-eKhmd)=<+CzzLanVK-2{|&NhmQWOtp`8B!ku?BQIH}C=>`l6^WG= z&8$??8?IrLZ|jCVNAf9$$MzGl14@0FX#tPo`*E_xT#jx4KS(+Su$`uxrLDif-6G7E zVZ?HB_M#g*)9sKmu!G5`*hYRe3>pVbzLWIrq#2^vtnAxKe30;QmND_tkKsv#W)R__ zEWVkeud|`ID~~lXf)x@h66peh2l4hQE)7HU6xV%|^as(^bJ%T=Nzbd!z%~XVSAPeN ziEu2<&elN`8e$V*4(|jLv^T&8T%?b(n#>J8nMOB3!4~5H*S}t%q%wdCA4UeG;*Ck9 z{bU4RA8Qb5R0uGW3?mw>7YC0iCdx4c%S}+>Lv26Zu+}X<$)HdJpu16zEPt8sPO1=O zsg&wP_ezD3LW*J(0-w}YfI6@k1RZQJ>Ojorz!aatYQ$aPh@ud0@KRDA4$PJ`zX{wh z%wf#YU!|J7Mok+UWfQr<#fNh^@LzIkg!5EWH-F(BZ&aakkS3*ibOZBaM0z^)O*woV| z>v;TQz(pT0f~xt71pJUPJ%K|~HfRI=w($#`*Xud!3VMh>e8Dmx6^#tl2dxS98$zZODm-C0Imj%M|z%A4XKXIEE*bHpxmwk zkQqOuOb4lt%(X`^OLPL^nh^!Tv~-Z`2=bv!NeGZF1M`a77Rgi+z&r99GajOavX!oi zR7n_g{31GO40C5tC-FQ+EqF_~A(>2E;RtoTB_hYIO#8_aex#uT%>3daO#>(s6!euE z0HZ7eW@)2N6q@VSSAwBP#lS4lLdrW!uqg%#%*@&cW)^NpnG#apSt28~ap7wfu0-7zu(|{kqHaf+|&j~EXvb@cVT>KpTbVGu? ztp#I!!*q#dCWttdYU){?+@`OIkBPTIIT=g>fzGm|Thow^kJ9U?Aa_~KQceaKNpic8 z0fz4}fWw5YKi2xoz6kBFLYG~T&Lq*Ox$yh~1OQlSazbmdT1X)ij_ah)faU}<;j)f0 z*a?SmMg5uY8ZYW0sN&D(vVMXxt!lGZsg#tPmPRp6N^nTwzC4Y=lB%TVNm4Xr83w6& zfJP!9tiF~8$@VOPoGOGhDA6^~2N^&#l?{o`3>=aQ5k%P%QHl(zmCB$72DY*;`P2hj zVsg_(>Uz`4smdkppH$=fBJm=Sm;j`Qq%~qLxLPnw%@|GcPf>mq1Q+O6!R!iV=AlK! zT3MJKhX7IcUKK2fD8>8Y9OR$No>bk;R@U@U-c%Xl*Y-0ZF#rXl=8Y8-$|EYW29WfF zLsHgQ(TR}(AtXTjDF_~l+!nKJWvDz;N7gnK@f7e==9h^I=0o#33X^L=M-N z9l}Qz@Zbxz_-t=3-<<6;$xjma62BnxTz@kCSy}ZhU9DcW|3P^*$7!5wKU}DBHLv8d zex#_9%Ql^aLsnVIVE|C8duHM>O7_GmrkAAGhy5=?5aL=I8sMX2XtrzV_ht9v9D8TplvJpP;nrR)I4-C2~LrWyMlvO8N5)Lmr)nx zPfxWpgk$bvL7yZt9^pH}IcVHOnvH`ijZ{NQ)mF$m+|H+T zi(sNOR!#_Ip<6u=LJ3%wkvfF1E6>tmfmiY=Y<4XD0Q~~ka2v2bNJnM&o87}yRC=n6 zurwPJ=K0}xr1-Cl)t?HtKNR|mO6|+%u=fu%ZE~P_Q0w5S3(q`QIQ5m?%g_6U_S(UW zKmTsW%+5Zd%pDoe9J2e~X7^j>Yle2NXrm3`;TI$cNsAr2(XR0*mZKjf~-9GSaPqWfL(lc81x1K7Ev44p@RdUpR zH}YACJqTW_n#MI!Rl)wn+Nw^?kDsxMZd>|zqd3j>hQiTh8J0$zO2y~cdhAd zz2OzL?su)o_V&Fr{VSe>6XWCOe{3G_zsaw&!+=OZVAPg7z4{G`c&T=GRs4<@$3N%} zo)gga+eMqTFA^WR-r-D09PrNr^?rkLhuEa=>Q*o`GVNg0t+ClZ+IJtC75~0d-0Q}# z9J_e`v&rYGpK*?MoPVdp5Gz6C#iyU!SzKLWsgj-Yzhm)xqz`dSw^&OXqfY^X)j?^0 zj4ztoGI^Qxlkv+eie?PHG$a4%o;yYnC$4?hG3}Gt!Sw5a4q6GLx6aDXvFO0P2p(}t zw;R{B(K~!T;$%wx-8=UmMQnG?N|?&C`!S(lR`i2|c?O?_qb##MqIyj^d$ViazpGZX zQC+d#>6B%mO=@GGyVHiuUf|tG^;+NPSsHsf4*jrqO=s;in`iy~E(N?zo2d~nX4Y%1 zElHQ!1{dz#z-?dFmovO!0yQCza~zt&!;pgdVL)iLK; z;Lu|GBj1}_wZ5#TzbNU8$%e<(8+9*9<*ZaRM z*6=vHFY)}nCu7h4eIt>+(=F#r+~bV+;EMD`|a-UrhoLT?o z7)6iJ+O+Jh)mbME|DH{HMmIi{V2$d~@V1STv$a=+C8h*t+RdMm)2;ZRAg9l=Ov9~( z`aPq+<_~#2E}^4o@82vgncTHcqUE2`oOyL_yLR@WwmBOHMt+%Vvd7hSZkCVN!bMS> zbbY6#8S%qj{(aT$T4K9-xxJ#-KlPj2UZU_lH#QN+Y3A)Yb)i=?!+*GfHZic~M^Ywz?O-~V<`~rTtnH-nm19)55>5D$D^_q;kdB`A$gE9vAH&8~zD4S~!Bf-J&RNj%e(j zXT^eVpC>F>-Zkb%$e@?U^h~;c_{8AE*d=WK?{4E9wPDGpyroy#qTP9$BFh%dXKjle zzAoFw;1$W1q%R0u@2_tM&OQBH_fAU*NUE9u98462KlyD| z{_oxBgJ{daX{WBl@*r5|OxL((uaa$J?dkUP^dh#E3 zzoF&>7>KEX;OQKRUw@ni&i#GS!M_JcZf5v%#86y$5!+THTRSNI^~t}%Y0U;V-yDlt z6rk+G>u6d}>PGt(uRR)mb(u$@IB$CH@ROF{;`&82Wl1m2a_Mkym%Q%juRZ4GKLp5a zWSf)q?;bxx?LyD&_X1$WiqD;IeEMXo-V4LTSADaKHq6~K_)o(rGtPNG-tCa}WB2yg 
ze>fCv=Zw|;R5&I3?zAp8TB@3Xl%Ta)h4g1>uu74A6N zyH&H_^b>|(-2BniWW${JV27mDJ=ag{%{p)5bt_c#E~djxaYxFt70g1lh!tAzv_sfmvz^8|G0J@*RzJb>wKMk;BoVbt-CaDq#8G*kzwKV9gfe> zn01}H=Y8Ac)5e7XS1<2e=0B-ho7mwkklQ^ug0{y1+-~HWKH0ASJJ;TbtJ5vjbW?Ei zh(?W6H7=K^KJ)5xW%upo?bgkkx;O9K`iuvgCoTMTeOAXdGiLx@we8lFIbCb*!0=Tm zzEPZAy4@p(oryO!OOMXFKkVJMzjvCwLvD>`eloMfZcxGb%&bKx;*JlCYdN8b@6cPJ zLl4dteB=p^d%CXjZ{FmzIe1C~*GyX-+a4%Y2SLufUE!bmo%J3NcSnnp;v013S%y)U zo$4AsBT~Ja$K`KNN?Eh^n^Dmr=ihg%v@jZ*;XT0k)v`NDSD&!5H$6!6RUb8eZuh_E z9v_r6a%j%U<}>mK9CT^@(R5^v`SQmJ^DR$&>6-3%{rvUen^c4JBfCC)_r<>VJ7Mp% z>+bDs-iLFdN50Ovf8aspz;|m#7#ePU%gcKA%+PR5#|3J0GDiAaa>W@gI$a$4>6jSX zTG;%dIe4hC#kaMd;b(A&w#TArrr^`1Z|k|IH2y&grP{?az%{ z*WIQ+*M+fSiYn+kR%2jVS+pXw!Yn|oUk80Wu&Vt7! zC9$#Vinl&}+Ew#)?{qKDu}?8;mvqpdQE)j%C4Wcar0D7SIlp(v7_e-351R>^9o*MG z%3Zp8$*5CP6L&s3vbbH$k+l7Wo}yds%{0c`?bo`;Aj@+{jxI{Nb93L*BYtOC_ipao zxA@$htev`-bM|&gnR&VB{Qbz+Pujix_-0gUhZ7MNM!iaUYqnGStwrQ9)t3kVXB~C` z+;_^xz~O%Eo=?Iv-^4vtH8mR?bo17nxJj@X>*Pb5d=;;wr<@YC9QR0H`o;NZh!pP_h7-fGm9IU7cJ3#a=MFt?`P8s zc7CNlGdy8dwD?im*Yu4|tgWv9u;NU(e*clsc;gw`J^s7gl*zaD_q}3uHsxzz`^O8v z<$X=_dmMVX=-)g~w*`~#ZJT_+z-aZHxsL22yKP!&*FU|v|Krj3E9~^-5%cLcmeS6) zdU^13(?yZs+36(fUt_kgoFzfn6la$-)2B|y4+b0-2`$b{$hv>!lxsdWVeXmLpYoE2 zM0|Zj>o%?DA4dLLb{&0}bl`5>GDgOnkAaUry?<*OvEv8w>D2tJ4^clp%+&Erf7)#M zBCq*CmDmLBg<<>r+T`3AqI+e>CvMzYL&rxQc5RK? z5?nIvd%|pXzDnDV4}u4p7zfSC%u|VvoHB9JB+c=(7`^O_|H3{`dNhCa(FJWg!Q(XDypYYRG0pPNh}BN3>c7uT7Mv~eweFhG3PSV|b$7tk( zfb^q2=_Us|_jz%4#G1Z6?*E(j@{_*cFWQNwqaIz*?0x0msoe)h=11MW)c=IxBlR6C z{64tvUb_=~X!mXO)5-Ch>;_L3=k9pS?Nie7oW_ck?6^*o3tb{T9EfRjGMt1-}0i94cw;5^y{~ zZ}9&7KCd!-=vFqXCJuc6*{bJ~)Hjz8KC*vwVpX4#DT#A@pVHD>c@*z;S^oG^oTCX4 z@#Rn3x;1Lt(Y|Mn|Lrc>w+1;Jxbtn1=f|bP@}C^g4)@OdUxKgd=2PF&N(NmoHqsbu zx?`pNl+6EGvCUh0M5%@qe+&3Jyi>1|DYQ94r>ti)X(u{&T;m|$2T%+Gyx`N_qVJ@i#gPo0~Z+yhMN zHFN2t&VBHO`L)=>%QNi3-ga@x)9yBNYL~6uVw*qHtMTR&m&}Wvzsvl0F>|PA&*+hA z%?olH=Pc6KS==XhpWoPl2d0Ul<1ZJq4*X)gdyIM`-#r^M?ld}kXZxA0H`KI-PE^l6 zx8?6?`Sc6j6PahfE_095+-v^t*Ra@p`T+22mp;!97v&S8w0{+oq3|BTPMSjdr^kQwk!)zqrMA(Sie&(~;5 zFEc;K_dl-vf=+SCPkc@~wpBHa_P5&ce%oEK(~fO#bMD=HyV+9Y(tZBod2TAIiN@yp z4hmv^oIH80yh&1);Z{^_?3zzfy5XWnI2zRrv;Z(sNR`}@U#=?nB6-|@~N*L?k% z--MmO*X`}j{;(r()21P3*8~Re=09tHs--Bf(V2G3&WGfE)pAepI=*hz8r`+=561S4 z7Bc&@e~(LUf7;shS&QW*j8-{5bo6ltpI`MK}YTnCRvo8tPMNhi5Z|ny?FK|$&&YH#i zGmqAvSNZn!LyREm)qhhy_<$Rnzi)Z7UeO8bW*^CDPb=oCXQ#NeeYaB(0mgX!Ep*RL zvm(2m(d+-50zj|Lw~)})S*)%)$=;?DI*$KQeC3~#4d=dpx*6S~ary$McE6Q%v7C0M z=^q9R(0#V(_3KBMOo~O1FYo-3sN350>i+rqf87~3=S-VR-`B3>%rkDnmLlMXt>ZsT z%fFtm_s-d#(Fs~{-!oLD7pL**?XF3Zi^~hA>zWsV$lLmddD2H=$r!Uy4}k|68RtKU zo*NnXMR3Gw;h}RW2M*+}Yx?EYtxjrVws;gYnf(4i6#wtPpQ!K5Q!y*j=xD5JYJIqm z@GLq{wz7G;8kUo_{7vuNDT2Zs+b;<{*thoazyEzWV2>ul0YszGXAABp?cLYn$Z_2R zCfBFrGTwjFnJZX4G5)ye@ueG@=&8bqV3djPG$b%4*&G3*#23O*kL-=={M6fnx)D zXYXD8d2XC2I;ma2GxfsRSEGHd{MJf+-~PWdJ<<#|csLAe`Uk4kn}>1>hvAMal7^R{p1B@c1CalYv6osh0;HqKhgiuhEZ z`}N(tNps$M_;ItIHO;n3IXFkqD%-DdPM?yEoS0c0>n&mLU+{Y6xLu97Qm`}qgW45m zBkqX>yZ_uWJ3QQaj_34#Yok(Q^MAj`O}jTcKXcwYwE?&PJ?yI2)pfV(p6r{e9(Qut z-zF~kLCJv4mZQIH@+j)|;myXK+XX>(W1qFj*3OE&_iBUpv@_fFUK%~LoHMa)w)X7& z;3lKbt$7ufzr^w@3uPzQZ&|kPv+b?fAO5zyA8_SU&Y+Kjzy)-kHAipM@Zp7=>6z=A zcwRiw3JkR4!Q*Kcmu0Q%)BN?O!cUuz-}ltYySU@K(e>$rMH@>Vjw;&nLOY_-xD3Z1 zj{;W?`{bC^D03N)xBd3T{ojv1=rB!ZU2(zsyO-9!92I&jX&8O^l?Rz&Y8FmWpVXgD z?7mCGrTE=^`dym?8Vh4gcRy=0X#0mLsoVM`zIXik;tFf&$Qi48=$|k0FHupy+b!KO zQRU2-;{|&jZHh{E9J1>2%e6gTvb~#le9@LXvP{kB-$w#_G;5$&w}<+xQa2Y`jVtkN znRGbjdnYyD`499r2d?XT@8Ye4M=UQ4ZJ)T}ZPC@_1(8mMyb+6XUyTY3{ocRbjVU@l zzQTtq48p=p?E@wTFJ1nj&FXj>aVm?qhhb@-HjmeqLe^Vb)bbqj?n+-&5CQW(@VeJInVanWlq{PvoO?q#%HUsX@9K}CM?T+H76$2 
z!L84h0%yR&E@Bj>Ztu1yanNe<`|xKu^TZ|lKk#z;L~Aq58ChB4l8nwj)C;eqJL<*i zdXM|s%kp1K`jaJ@!9N^Y_FZwf|ID>4#CbA^QJKU4BRj=*>XI=-fhuazPrIFMA;qwk z`f*{ep#$Pe*d2k1aiJ&tVU{_$YnOo+GpS z+%H)tY_b_#=1a3ChF(W}Y+L%X(^L;LR`p)yaHQ#h+v~c%z4oy2p${&%H#a?UsboYi zt(l9)o8AV}Z4#t;a&$|x0K{_2^=`n{2q3j1y=Uy1;Z-!{!|8}4-==Lf$n+N9d-?Ur z{c~?31>Ho=a@!^^w>DbMOx@xCcyz+J6XF@s`#*6U4*uZqHn_YLUG1^4=e-}Mw^!SL zd8d7S!DG#hCknnLCgr~VQzd87s`a-UHTkSHcA%k}>D$%1saAO}gF+oWvuz$GeboE( zO}OIS$=|{|C4OH&@wARW$OoQhGjLxS3!e09YoK~jdsB~?kgHsa`u#T##WHO{s%&AN6# z#&WN zk2c%d;fsgEt7fACTcfgt9q{ksW%0W|&sq8Kp7tt6eDdmZ4p}J&KlaIc-b=g9Q&)Pg zqzTf}CjNx7Q;k4KZI;hSE?yKIYeD;mGOxF9=gYLo1%+Iuc#A7h zacZlMv{ieS6{islh&FG#gte>EZr(Y;$z0EfZhZRT{v+LXjVWXlsb-Z-$;m44cTZ1_ z@qgR?sB<@QPhEqdo_`!KaE|fLZ2x2UmmR&9?bte?Lo<+M5lM+lcKsu1{kxZA zvL-cmbQ)#9WbZBa=)#S97OP%28Va69N|Z~6V~m&PO+%x1t(H$4?~R1v8}{t)l)fn4 z$75;Kh8H4L)23!^tTvt7>}hr1G%9jMfK}0wyN$v_LB#O2-CMEC z9(in>=#Rw@=||pe`1a;Z>-6;h?w#U{FXBHkwEWO&V)wq@ou|6>=-JkEp|^kUoJHHC z_Ly%s8U||t-nb-a9`0rqUD(gzLws&*^9hsUJl+}K^}g*Gad_2`{GP3^Zl7X0>QAFJ zQvs|yo{zU_7c(qz$^Juca@v1gqnS6PZ_>IFhbxl?Ej#{#dE)SiA&Xff=+F26<=D~R ztZ7H7HU(;U_DhY|j6b?~(p-zz-aGqEO6Fha89gIkHB=)mvd#0)Dvd_0N$Z!fpfG>5 z(^=i<^iSyr$MwGa<4yXVHS6No>fcA-dOf`}fGC+|A!MbWJ!5 z{N*Y0yX3r5b6YV7IBjNgH1kp!Z~pjZc`*KuA*y-jGWm%`sg6V69=TR<`Q6E$*Ov}f z>zn4bctzj5-$Tr1sHZ!sj)?OJF)@UXsX$L91YJ96gw`$Z>3 zM$bx%d3~|foU~Uq$Co=5+a3AzBs1@#+P5$IddwxA21kG0H7@H@^uE~iDDCJ4-@i>{ z-u4@N!{pQ36YE2hOWIvdd*6SeKd-B6MpW?D$F0Tx1fTa?{{8(WZs@R>+a*&nQ(QJ( zIzQAjB3s|%_WuIX26PhcS(-D1H?qxf5P2jScQ9SAnX@w2Z@duAyd-$LEb{ako6y$RehfW& z@2*$j-iH?A;s+D`G7lf)|K&TY=z-%HTVT>VnC~=yH^aT?kp8Syd;T+DdHD6e`<4fv za5GpP6a4)a|K@68`~P*y%$)O#)7fN%-Riq<-Mxy2vUD8G^UrG*n7@dA6VysGG(78! zT_MM%caF~>SC{u{i++0;WASoc_+||o%jh+(z~3`^-0D>JmA;+&M{i3pyuQin`2BSo z(!}3Sjg8B1cXjdJD~C_M`7({`^|^!T>1H99qWKq6`2TeJJmubh{WlaBIB)6Y{ABujULr_$2gdz+0L;B3K>0{rE{j)bhf2J9q!9cD(17$D{w~Zt^;AQAyOpD+fmeKJ9hC-8g>a=h+(* zy6@`L^WdTO-z*DLOLC4symE0t{#knm?|*v~2yV_o{O!OG>e(Z6Qzo_lbMB5G<0XFL z4~G`Jtv4lTe)E|8&CN5`Ia>YU-9Pen=6$}XmRRucTei!-Cs$2hP04++z~5%dB3%ak zBdhKEu<2JO4t`uP&9eCHgs20Vtec(eb_pKb6ns6-FnHVQz-^KLli~WSBVVM>oqMp* zWbwS28oPGKJRZF3`i=!rj16s;OiFWa6Z+?*m^A;A9UTUHjkG?OmpA(I*M5E-r(Nnj z(q-(A7iY$&j|=eGt2W1Z#`e2rds?0zIy)ilfhaM5_4k>vT~a)QU#km0@_i1RIC?_m z>!R3}9(d<&PunlfyU)88y+^{Gwqldt;{;YnU@>F^^|4PfV zxOZ>coM)3$FJ0I&ZRD{gUZ0m7Wu!0iSbXTj^{WSlT%K_Cok`!b`d-H#e%SJ3>D95J z)_={-)cxm1u=;0b;8d(H$QpZT%GT%=2T;H2vfE}|yUR=0Il23J3|;$q+}`QEw?EU0 zd+2||#IeNw?AE~-bxQP3G?h^Zc+S+s5Iu^QH5b78V`U z*!6(f_uh>B>#B`ZO}q4bJ86CUj0IEFTlaWmBM=m<`|@wU60f2#F7J7#UTWOv^jD83 z-SuF#>vf{zvVa6`YWAqkc}(ud*W2U#mhEBo?|K^grzO-QYl9 zw|xUnX#B4;pFQVNw=Km9kLN5K`%P=?<)Djm53CG7zkkdCkMxx#43A4wR$SDc7XN(d zoc*fno&C>G&KNfCW=zi8t)1Pq4jyvaEBfdb8bd$yWA$xj+M!8-*T<3iX>)G%OQ|i!@L4Xmemly5sIC7B?g4Kor^-tjbSR%dmV`aMSSS;q(Q6 zv9y_kKfmaAe@=o|&IY|mk>g*U7d1y_+;%jTsD)|%Z~7kb*?xJWemK~_in@{6V*ADY zo0G!FjYwaxGD`iAF(-mDb1${iKas84G}D(qr;E=$N<8O#Vy%-ZtZ$@y${a zlJ9knZ~b&*+QvDrmS6JUK1yX%_h_%c{eXyx@lIHwzgPFnxa4BB?h7}hj+?p7Bdm4z zyPtMUHNJIpaMZ#D=}$aA>qP6Pr_HqpdAx1Mvzxm*7zG}49-TSeIQ2tH*8T_gB_wT8 zqk*Z16Vy#%Ce-rghNMURQvZ1!ATar>uhB@c*wof?K)k)z)JAznzD&tj5ZU=;Gu6R$Sm5TAFCFxnKSR`r3+uvawd)ZD?k2g59_DHbBw-qEjC|zJpR?f z@AfCoYs3|63&VH6xADzLXlEMTb^@R>4_qq{u7AOIp4uX3n?=$2>EWEo*WOLvb+1p8 ziMIu7z6`l!bE9L-vd%6#M}TQJQO|p1bg+%8Y536jPPWWgP$UG1vjWT9QI`2u}RLkJ+$=EBI_2a8TvKtMud{PE9>8 zKl+a?YSB!X9Q&7ZU&oOmO@}9s&iyYSX1-?5 zqED$ty`odQ&(!yFLqINl!FTUfA8w2o`y}p1sObRh@wxq&t%PrufN0Z{p0e~?@f4kX z8ydgs*grZo>W$Xrg*i_>bC=HBf8_n;2{r?DT=q_1-&TJQ@NE*hhO9W0a6aluY~<%k+#pOwJ-L(^!42YJJ-0kpN>6zZT4Y-#YneP zpH{1D8R&kUP_UZ6a`2HKtJ_5-XdL%ve$Rb$^ARnvsnhO$ADt%%)BR0- 
zMPObh+92RCqh{YzsJd8k{A?$qCGv$tOFEg^Z$B^jpi9Z1WG&nMLHWlJw-tzR-4LEN22EX6r!s`#Hr2(-xT}x zKx$KeyZ|o5XX4kYQarLL>`gl7-nj(p?~07Azk2g+g!s%0+Dm*reqXC*nJcFD&ezQ> zs5Gzsx_-YSU`cvbj-iJ&sIlDRjb>*=mFR856%{Hud6S)N265@N%`cIBzf*Q<&&417 zCec#A_9lLLP-l1M+X@QSJ#>if!X5-0*m{>9mEtbH{&%j5>IR%G7KTgSUc6>2M#l;Flm<{YsPIQ^ zjAKW63u3lhVj!rSwL$TYo_XKqo7PA^E11jd1WV1E+hiMK>b~=h@N2fPiQ3j9^S$GV z{n1$I%Le-S&Hj$zwY;7qLwD|*va+a%eH7g~_>OD2@ z44{k7)o+Vz*%suHkr2*2C*k_-D`%>E`ut|`IEI}7&*b2^~$*m4l$>=Hb1wiDthooVyx1`<5*$=qirRSwR068 z3lILFMCF(VHo$8wvu2)KuAoqgh?vL#!~Dd14WH{L+CEO6ESX2MO|Xu+Z|O$#z!Yqj z;h~JxJFj)xP0@|sw9mu}<7FGm3bW=Z2=7q(u_Gl$oU zVg1DR;@@DwyCK$9ZDA3OodIPP%*M+HbAi&sTCJhlAZ7@${SkWZVSpy~~~S zL$A;YV@$rw4J($59dOPAp}Mu6ZMBuLw!-OPUf5uDFNuWbo!j<#Jvw4%6%MSm=4wWhB)99T@SFD__aQ1nQYxf$J2pHrVsjlh{K|)%6tH| zeTsP%0#ZRJBIFZ*wNbW}usEcH)%G5nSm`bWVW5R+&zk9BXc(!M5Z@jGlHQqM(U!f` znX<>NfI-M4`ak}92m=mOm66z4t%L3hg3vh+PC=-`K!gCcW{{AdY5QN!H*!EB15<~@GcRxTnUS_kXnwVKVLP2Y=Di;_?o{Cf7H72|Hq z$~}gq9!$ra?^=_jbJ_47O4~Z-%DcWQjoYB?yGN7$|7N|rMazzJFZnw?zlRtl7UZ;X z{Ag&6o8X4G#;wSIUEc3?UmBYBYC==;qPP0R4*b4CeBVq&hf~K&{FX@;nyYnLO}c{A zFn}Jf^)Z9WrQ&S#0I*5bFJXK4w&LZcvT(<)4?v8}#g#v>{PcOUpS^Icd~MF`m%SsQ zSEFoIOfM}VQPH0}^VWak$H2Rx_1mgMiHt31?%rlc`}!-sbcWT|E-Z%mQT#0^MWd)+ z(pr6iS>FG@2J#HUPx4u?yj;8`xX)ALw&S0E^7pggQlx zf}Zqwc~*N)7N14@?A%~1W46??_ZXrDpH|`Z0~%DgIb4W?Z4PsO!!lZ>%Yg`~ z;ky@6;(dr(Sjb#9e30@j%TsSuQzxtGaRl)|Sxycwrr9AiQ@nO!!XlHR*Y>8wTi1-& z9G^<@FEEJvg!vupIMuxq3P<0FH&1>O&TKNenl_^vsdNJBK_v)IXmz3rd+ctM`%j@F zh9!Nq8w++`wC!E`b%R_Vao+RMLab%y#}Gri!z<9+-&SuLejjnKW4o{w^{UtoT;;XyR!c*r<1XQCJ~Sp=lssXIG~o^CB;+DqNoPDUr~ zMTEr6I@EW2vB~4OyhSL`E=Ve3&G9cPj(RBjSz+M1x7Qzt)Ns%`G*AZ7BAI@tPCpGS zvEd%8ZZ;S|*E$I+55SCny7EGRg>*w6HoU5_TI`^d^|T54isW3`)%upgU8#d+hxydz z-A{f-LwZ#euj-bJRt~wNbCs%)(=j>ChwCNl3<{X^SsHn#3zm&$@EfN5bqgYh+|9jD zMLF1Q6tU4pzz(q<7B zoM#LEgc7OAR*c&jDxI2Aa^8xAez}8ac`sfL^NP4-!r$_YOU7u*rxy#k+X#1UlEcs} zkmp~ysER>oz55sPGWN3OZ}ab2@S&;fS!8L!l@TfWfH*&Y{V8P?-)O0+@j(W~^LKAf z+^ghh{SAQOtH32NjG9BxcOF9??*}HMS1YK*XYq<}G zbVjm@J~I>Tf0l7PT3KIS)wl+kbRn4#dBXkkX92UIOxc?^?G6raGs%0^JVgm)gBll| zwH)=F>@oYhZ0PZ0 z0q2S3PmgM4n@va~n2`S)?sn4&e-57|UU@88mBap#8G<;-XJz!ucTJwwRZnx#NhVV^ zL`#6u93OSK4_O408N8eE(q8g!i54|3_yh5sfF~3|p@=t9;t&g@-gafb9|z>KDp%6t z=oSN6OQI+aBOc(W?6KX$SuR3h(Ct5A=rym+8Ep;t7mHP!V@84-tP!>IotZRGOPDND zGJ(`A!&VsJLP>XVbS%gh9AXRkn;iOYshZ43i4J< zSLhE~teml2`SwEx_dTy-M2p1*@;d+oDkTFT%};4twhYNo5mGZ)=ryY6R>MxwZ(n<) z^71DC)7<&kHAowc)8tn8@JM5kJSL6tX@S9gw+&0WGot{)cfz70V~Jl}oNrn!p(?ZJ z8DSbsr^tK!hUP9VWua)v#L6nH7a_H7WvsA#$rQWR4L?QE`z6q`L_Eor#I;ki{UL;HLGLexX!+#>cl7eSNAlIrZJaCA2_qmuZ<~YiSW1O zs0i%uo1_wjq9eahsD9RDh6~HUC~!SuA|=O9poB)MP1TU*$G~REMQA@M*7NgPQGOKO z7hw(^jp0jRS@yknL4#TFDi1xFb(cS+(m%(0Sm15QnBpJ2ZV2~%`nDotGRL274i7Ka zcz7IAary-#BDWrE2<9E@+yT&_*EvE%d&oOy6m zVsMl1sKeLQ@p50PjYoH6OwnI3Q}p7!4;`oEEhcdS7IwH`1>etZ^I*%iR9_6pAP_8M)he) zq%?kSoJWcERm0LPON)se^X~G3g&B^_UNK#vwjo7p*SwE314G>{Vhb(2`d@}aSuM5- zX}=|+ItCae#QlfY%WdyG!)v>8YCu{nqyihPLL%u00b!rrc@*7YA=gD`FF4ray~wJc zXMZG#jAM}-Jn+xrxt!26KISbfn<FWs|y4F7j?@no?nDE?Ab(jCobWdu3081y0B+K99Jj zzP#`3d|2kF!q97AXn61?bkBYHl0C#TPu{G(Z>s(L`SZ@NXY(KC|5S^nJmny{Q6R$5 z>EF+U2*~w)Jkw|IwE40ox+PLoBZb|0O?%9$$&JA$J`U}g@<$-=xHmF|Ji$LbcOX4? z`1qTr&BL@+b_;*e1=IU?;^@}?1;QSznIs7*N+DI5KD|J5=ycE1h@29pU@?HUwe7S2 zJlK`9nLC2T$CM{yS^wd!qEP8y;uAR#^T!Tx{R%7&pBJZJMzg-(Y7(w8f~0^0V0nMs zutxoBOKo=ji{`ab_i>w%XWi=C4!1*gB^vhQt}VCJBsyD*WQrD_Qk_CJv4i7=&I??+ zzErN2RO>fZUQqJTR>gg&ZuG#izEIgEcXWAZsOGroPxj|6l^F>?g1;PFE^0vjAy0}x zb2I+dtyp5QDN9}Gb&3%YX2R1E-?Z8F*tUbNj)NQP_A&ab@wR_ONx0|frayx(jwN-y zOjk4nc}AH*eJjr!ymJCO9u6w*475x!X7c079cYh#qg}5o@tfT~otFS0!K`n&awz|)`RZt5En;JFgXMH}qujOY{=f-P_1>gO#OEuj+gBZNl%z8ugMc2! 
z&FHXgMTF-{qWMp}axHVg$IEI2X|0J@EeUBX-w_wy)A)W3qGmR5`E(#*SsG7Z<;Fj=$9*e;vl zhbZsc^bq8{$2)!HU(O&PJit;wr>%_RLU3ab`^!uMGuOijaLdhcX3QrYl&J*$f=wLL zND&1GD}cVxndhDK1#|~|0jY6Da{!RMWrVR`8(OEig?q{e%am%r{~ql550Ai8Pb7%u zPg!$0;GtRhGiSg&d4Rsf^Kx#aP9r1I(g^c5fw>y72UVJP|kyKy!5`M51H4N}LceU%yT2vJp6F8FIL4=GI!GBRniiA#qY?- zq^}b-EkR~xc0mt)&~1eEi@WSXZXh-*zP`MH{+dxxKM$LF$*rwC?5QIz9*83Nv-@X< zGmd1fCSsL-DP&oBVWvo?O%!MS(crAL%u?xc`SMHeL6_yq(s0T=+8}1A%Y~GO>qdw? zVVtJ7hH2s}^f&tYrLhk&22GyatVbS!4i1g;#TEGLe7_j~lMVO%klyn<4HS2*7=TkI znX1JXkjO|XM!UD>tJ~=5e)R~EUVRq+VIcpo(!tw1aRT!*#3oqJsR?0#nX})TZ{-vb zPPQ)ATG^f*$oH@sDP^k}Yfr$7vLG<)!A=t?BRC9T$%~8dc!b{H(o{Wkplm^m1_1th*?=|#S;$k$&6*P^sr}1JnLyq&d)jwe>b{~ zI74Fp0k1UXYE7}z?TYn2j!?aS^E`wMg)=2BCeDNE-urgYAF#6%c2i^zkcn`K_ z)iyH5Bj^@x%3z*}$q=6s<}R(_U9VvM0smfsW8(yMm4N-cZ5hbK8I}a{WBdC?X z6m=Z&(@m=(c?1N@kQ!97H8ribIWziAo-|3bKT_yuAoUMAO(InpjcAUpLT8L-2Fow| z2YQh4HZm@3*Wgl6ReLp$95g9T_xm;=;A;82iJ*@@@_SYKnE0g1!n16Vy;iZR6Jy@}(tErsInHZ~8?VBR-K0edTsn^N{V@)Ryo9 z#yow=b@%KB1?66sBQ>f}@bJ3gxee{Hqt-?2Q%qghJ8~t^dW959L;wXJFkdd30%ZN~ zy~|f|SJSD(NM(8vRfsY!_6#P4K5(TQp+UyE@`cFaPT~x3P21tq?$`iddF|uiILU_g zgXvdmfVBG+IRL?p?KSvLa#ZDyLm<4c#`L)dupN7tGkO8f>oR38&U>N(y!dvS`^h7u zAe**HVz>?@C$`d{{>*U9hMYalAN&mtmO%+jUtqQjp`fuldIK$hX~Ple&Uda0Z9#w+QY(LKLo^y zixp6cG^`s8BG5|z;`8f#ME&t+mw>@=9JqEQu@jrROB_+|cE7AvEW(8ib)USf9f5XZ zA!q5{%e0dxGi^W=-|(9-Bra?vE_{XzbE>>&U-LX#$6E2M0wLCgcm`IuupDFX+YUrD z3e=rZq|j%pUW}-A6`#8gqefqojX~6lld#0Q^>9S4>(L|FWmfnF&LaXcS&WWXhlK}r zwc>%`L~f&&u9qw#w`=F!+JG$3%Xp~gFOJkGwSE{GE7ec7(Y;9qwGhj=@1O64=!vVK z?Bjhy@GTOlfvw@2k7XK^5jvb$j~zJnWGRT14nXYSn#^ni@{uS+a1T*BU;rggO1$KA zu#64_m$YX5b3pC{9uhY;C=pk$8N-r6^}PJC12APo6)sx40ELCvwRTrxmNfpY&-Sl6 zP(<%$uR_(^mXzsA9`|Tiw+rGX0V}nExDX*jgX|B_ZFqKd>Nh(e@L2owUXxav$iXl- zZBT5KTFv+8mIS8R@YN^pIQxemS+3si6&{SxC51|xg}*N%-bk7WA!JoFp-2#AkX_kJ z@DDa&l;8<*dzw4hJ$+u0_%10_9U22hby+p-#HpskT6nPj1vLO=ynD1R)e;VGa&=;_ z{frg>_Pm~#clj$bJPdE|nrAk5a$EWuIkayk+p=72fRVMZ2_PMwzG-u1Z?lEN61J6G z@8H3#ai(Yqu6&y1j6u4q92@TC$MSwlM-BGt)%O|i3chvQM>4JY%ySB_M>*I5j?j|h z7eFZcGFKDGc_^W!XcZ5Wcs-<)G5!@*nQvn~F7128c>i02{a|uNCF{%DL|U$@<{s(J z>N{S)>@+x?j05GL#5{!L>er?8HMxz|?x-y9eewfb^k_rCO8bvgP=%&8opXVV)C`G=^r1b1x_G;<{3M&g_-Ls&i z|EpZYJ~dU1BZ5N^%Hdz=GXVk-Tf|k|rP9NuS%zx=iO^kG9KxnJTkG@4Z?;}aLc7jv zu^ztWGe{a^a2aRNODEVULsBq$tA<&f zwF)gEISUMc6xQCjJ4`j#%1l7$-MGY{!Yg=-u)0O-Yq^MT=WD|=;Y;{BE$O+~*i`*h zLBR2sRQfpoHZxje+HdNZMVnsd9H#bR$1MaNs;L&R5&E&*yf(l|I3V z$?z*b7i`toHzCAPb14O8C$CLoMOflg{s@L8uspJQ|MGZ^9d0z2eF+VNa=;$AE1JG- zjhI2DRQ1~IOqsB7T}HLvRM~xT7QUPx;Dz7Z+MhoqAMF#Zz0y#qp1U#S%2K3?vRx<} z+FmL-u-j{-d*Hp4t|MVTo4_RmMIDP6Nyqe+#=bJY`+_w*4)eHZ>uq4+;<%W0R!<_j z^V^ZNZSD`NJmTESp>&S1Qc4?A=zJhKMICBPGRd3_cTKZD&5dSvxwZ>ceUyprDEU5l z#3L}iNXE`8661Y>Rcz;*Q8_j&vG}U?qnA;NgnaBci7eY9x+zh4qY9*HQZBp^1}H2d zeCa>?L}+TLlmnu0n5+Wf>g;M0cvGI>nLdUA+RI3@0p$y7Sa?Qa?`-^xRc?p4z|KaE zhgYFF(Ef47tm(czyG-_yi zHisd>eu>vB{!@(tTl#z7Ay3nJI%b$L{G(CkTo&{x^?OK1G{* z^;|Z4+2w3pB1jeXoTW96;nDuix&VHko3J|O;Vt*1D__v^6q2p|=UEQeS?I?8WNirD zGr_#sODL`CDd!uejjhgpQ&v1`*6ie#_mtLh2r*&l;3JyNg3{nUXj`FDN#W_L;ZA-k zipLDE4tP7kc}^kAJyW2(?fNF$#%Rj)%;ApvtEqNC_CC_pas1j+rg!(ks&2dNtKBvN6UQ-0u zfu>;O^PIgXnS}1K!S3Nn&30?QwpHe?4lFgljac8#cl&Y6msLp=#=>HyX0O^o+mZEW zz|LJ?kSI|U)8gc;hPRH`N}UH94ob;4xVDDM7D&l*rO5;5`Z1WThW4e{)<3aA{RpyE zX=&BtVNVuIl31gi6sO}}7>Y9&(sEzm)J*+Po>~l~8yZ9fN ztWwKz;NS55Xr6!3ASaq7L0IJD)$vfFtCKmO_Ii?++SKrn9?2?0J_F`&gcYWMo*4vgT?Fut z?h0L9y*7Gup11yo3R~Omns(fv???c1el%Bctk$Y&wUhGBCEy=H7*0Ir=#$iI9uVK- z?3%vc=3I_;wbcq6lY`Z!ws-c#`}N2DGU4U{&ivSTJgS2?+6pSid>1yu-$xP!o( zw1E{~kM3fu26%!UnfF)#RX24BF78XlX8!j^>AzKUq7;Mcoj)gi{W7Q^_x)?%UC~c6 
[remaining GIT binary patch data for images/compressed_sdpa_pattern.png omitted]

diff --git a/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md b/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md
new file mode 100644
index 00000000000..e7a55ef571c
--- /dev/null
+++ b/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md
@@ -0,0 +1,119 @@
+SDPA with Compressed Key and Value {#dev_guide_graph_sdpa_compressed_kv}
+========================================================================
+
+## Overview
+
+int4 and int8 compression for Key and Value is exploited in fused Scaled
+Dot-Product Attention (SDPA)[1] to reduce the memory footprint of generative
+inference of LLMs, especially when the KV cache mechanism is adopted.
+Specifically, Key and Value tensors are stored in lower-precision data types
+such as int4 and int8 to reduce memory usage, and are subsequently dequantized
+to wider floating-point data types such as f16 and bf16 for computation.
+
+Note that grouped quantization is required to improve the model accuracy,
+especially for int4 data types. In this case, a group size is needed as an
+attribute for quantization; it indicates the number of elements that share
+the same scaling factor and zero-points in each quantization group.
+
+The notations used in this topic are:
+
+- N: The mini-batch size.
+- H: The head number.
+- S: The sequence length.
+- D: The size of each head.
+- G: The group size.
+
+## SDPA Pattern
+
+The SDPA pattern with compressed Key and Value is defined as a directed
+acyclic graph (DAG) using oneDNN Graph API. oneDNN extends the
+[SDPA pattern](@ref dev_guide_graph_sdpa) to support the following three kinds
+of compressed SDPA patterns:
+
+1. SDPA with compressed Key and Value.
+2. SDPA with floating-point Key and compressed Value.
+3. SDPA with compressed Key and floating-point Value.
+
+The floating-point data types include f32, f16, and bf16, while the compressed
+data types are low-precision integral types: int4 (u4/s4) and int8 (u8/s8).
+
+In the oneDNN Graph API, quantization is supported through a pattern with
+quantization operations such as
+[DynamicDequantize](@ref dev_guide_op_dynamicdequantize) and
+[DynamicQuantize](@ref dev_guide_op_dynamicquantize). The supported pattern is
+shown below. The blue nodes are required while the brown nodes are optional.
+
+![compressed SDPA pattern](images/compressed_sdpa_pattern.png)
+
+Compared to a typical SDPA pattern, there are a few differences:
+
+1. Two additional DynamicDequantize operations are applied to the input Key and
+Value to convert the integral values to floating-point values.
+2. Apart from the Query, Key, and Value inputs, the pattern requires additional
+quantization information such as scales and zero-points for the dequantization
+of Key and Value tensors. Currently, oneDNN only supports grouped quantization
+on one dimension; specifically, the shapes of the scales and zero-points for
+Key and Value dequantization should be (N, H, S, D/G).
+3. Additionally, the `group_shape` attribute of the quantization operations
+must be specified as (1, 1, 1, G) for Key and Value dequantization.
+
+## Data Types
+
+oneDNN supports the following combinations of data types for Query, Key, Value,
+output, scale for Key, zero-points for Key, scale for Value, and zero-points
+for Value:
+
+| Query   | Key     | Scale_K | Zp_K            | Value  | Scale_V | Zp_V            | Output |
+|:--------|:--------|:--------|:----------------|:-------|:--------|:----------------|:-------|
+| dt_fp   | dt_int  | dt_fp   | u4,s4,u8,s8,s32 | dt_int | dt_fp   | u4,s4,u8,s8,s32 | dt_fp  |
+| dt_fp   | dt_int  | dt_fp   | u4,s4,u8,s8,s32 | dt_fp  | N/A     | N/A             | dt_fp  |
+| dt_fp   | dt_fp   | N/A     | N/A             | dt_int | dt_fp   | u4,s4,u8,s8,s32 | dt_fp  |
+
+Notes:
+- dt_fp can be f16, bf16, or f32.
+- dt_int can be u8, s8, u4, or s4.
+- Zero-point inputs are optional.
+
+You can specify the data type via the input and output data type fields of
+logical tensors for each operation. The definition of the data types and their
+support status on different CPU and GPU platforms follow the general
+description in @ref dev_guide_data_types.
+
+### Floating-point Math Mode
+
+You should set the floating-point math mode
+(@ref dev_guide_attributes_fpmath_mode) when using SDPA with compressed Key and
+Value. Generally, the math mode should align with the data type of the Query,
+which indicates the computation data type. Additionally, the second boolean
+flag, `apply_to_int`, should be set to true. You can configure these attribute
+values using the `set_fpmath_mode` API
+(@ref dnnl::graph::graph::set_fpmath_mode) on the graph object.
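+
+The following is a minimal, illustrative sketch of this configuration with the
+C++ API. It is not part of the pattern definition: the choice of `f16` math
+mode and a GPU engine kind are assumptions that match an f16 Query tensor, and
+the SDPA operations, logical tensors, partitioning, and compilation steps are
+omitted.
+
+```cpp
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+int main() {
+    using namespace dnnl;
+
+    // Create a graph bound to a GPU engine kind.
+    graph::graph g(engine::kind::gpu);
+
+    // Align the math mode with the Query data type (f16 here) and set
+    // apply_to_int = true so it also covers the compressed int4/int8 inputs.
+    g.set_fpmath_mode(fpmath_mode::f16, /*apply_to_int=*/true);
+
+    // ... add the SDPA operations (MatMul, SoftMax, DynamicDequantize, ...),
+    // then finalize the graph and query partitions as usual.
+    g.finalize();
+    return 0;
+}
+```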
+
+## Implementation Limitations
+
+- oneDNN primitive-based SDPA with compressed Key and Value is implemented as
+a reference implementation on both Intel Architecture Processors and Intel
+Graphics Products. The reference implementation requires memory to store the
+intermediate results of the dot products between Query and Key, which takes
+\f$O(S^2)\f$ memory. It may lead to an out-of-memory error when computing
+long-sequence-length inputs on platforms with limited memory.
+- The compressed SDPA patterns functionally support all input shapes meeting
+the shape requirements of each operation in the graph.
+- CPU
+  - oneDNN does not provide an optimized implementation on CPU currently. All
+    executions fall back to the primitive-based reference computation.
+- GPU
+  - Optimized implementation is available for 4D Q/K/V tensors with the shape
+    defined as (N, H, S, D) for Query and Value, (N, H, D, S) for Key,
+    (N, H, D/G, S) for scales and zero-points of Key (if available) and
+    (N, H, S, D/G) for scales and zero-points of Value (if available).
+  - Optimized implementation is available for compressed SDPA with `f16`
+    computation data type on Intel Graphics Products with Intel(R) Xe Matrix
+    Extensions (Intel(R) XMX) support.
+  - If int4 zero-points are specified, the optimized implementation is only
+    available when the group size equals 16.
+
+## References
+
+[1] Attention is all you need, https://arxiv.org/abs/1706.03762v7

From a2c675240f1efc659a535a7126178739197220c2 Mon Sep 17 00:00:00 2001
From: Jiexin-Zheng
Date: Fri, 10 Jan 2025 06:48:58 +0000
Subject: [PATCH 05/40] graph: backend,interface: add select binary impl

---
 src/graph/backend/dnnl/dnnl_op_def.hpp        |   1 +
 src/graph/backend/dnnl/dnnl_shape_infer.cpp   |  66 ++++++--
 src/graph/backend/dnnl/dnnl_shape_infer.hpp   |   6 +-
 .../backend/dnnl/kernels/large_partition.cpp  |   2 +
 src/graph/backend/dnnl/kernels/matmul.cpp     |   3 +
 src/graph/backend/dnnl/kernels/sdp_decomp.cpp |   4 +-
 .../dnnl/kernels/sdp_decomp_config.cpp        |  19 ++-
 src/graph/backend/dnnl/kernels/select.cpp     |   3 +
 src/graph/backend/dnnl/op_executable.cpp      |  17 +-
 src/graph/backend/dnnl/passes/lower.cpp       | 123 +++------------
 src/graph/backend/dnnl/passes/transform.cpp   | 146 +++++++++++++++++-
 src/graph/backend/dnnl/passes/transform.hpp   |  10 +-
 src/graph/backend/dnnl/passes/utils.cpp       |  18 ++-
 src/graph/backend/dnnl/passes/utils.hpp       |   5 +-
 src/graph/interface/shape_infer.cpp           |   7 +-
 src/graph/interface/shape_infer.hpp           |   4 +-
 16 files changed, 306 insertions(+), 128 deletions(-)

diff --git a/src/graph/backend/dnnl/dnnl_op_def.hpp b/src/graph/backend/dnnl/dnnl_op_def.hpp
index 148efc50817..5dd7a8e1776 100644
--- a/src/graph/backend/dnnl/dnnl_op_def.hpp
+++ b/src/graph/backend/dnnl/dnnl_op_def.hpp
@@ -701,6 +701,7 @@ DNNL_GRAPH_OP_SCHEMA(dnnl_binary, 1,
                 .set_num_outputs(2)
                 .set_input(0, "a")
                 .set_input(1, "b")
+                .set_input(2, "cond")
                 .set_output(0, "output")
                 .set_output(1, "scratchpad")
                 // Attributes inherited from front binary ops (Add, Multiply,
diff --git a/src/graph/backend/dnnl/dnnl_shape_infer.cpp b/src/graph/backend/dnnl/dnnl_shape_infer.cpp
index 781fe979190..b94ab7a87aa 100644
--- a/src/graph/backend/dnnl/dnnl_shape_infer.cpp
+++ b/src/graph/backend/dnnl/dnnl_shape_infer.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2024 Intel Corporation
+* Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,9 +15,9 @@ *******************************************************************************/ #include -#include - #include "graph/interface/shape_infer.hpp" +#include "oneapi/dnnl/dnnl.hpp" +#include #include "graph/backend/dnnl/dnnl_shape_infer.hpp" #include "graph/backend/dnnl/internal_attrs.hpp" @@ -484,17 +484,65 @@ status_t infer_dnnl_pool_bwd_output_shape(op_t *n, return status::success; } +status_t infer_binary_select_output_shape(op_t *n, + std::vector &inputs, + std::vector &outputs) { + auto in0 = logical_tensor_wrapper_t(inputs[0]); + auto in1 = logical_tensor_wrapper_t(inputs[1]); + auto in2 = logical_tensor_wrapper_t(inputs[2]); + + const bool shapes_should_match = n->has_attr(op_attr::auto_broadcast) + ? "none" == n->get_attr(op_attr::auto_broadcast) + : false; + + dims input0_dims = in0.vdims(); + dims input1_dims = in1.vdims(); + dims input2_dims = in2.vdims(); + dims inferred_out_shape; + + if (shapes_should_match) { // no broadcast + VCHECK_INVALID_SHAPE( + (input0_dims == input1_dims && input1_dims == input2_dims), + "%s, all input dims should match each other if there is no " + "broadcast. input0 dims: %s, input1 dims: %s, input2 dims: %s ", + op_t::kind2str(n->get_kind()).c_str(), + dims2str(input0_dims).c_str(), dims2str(input1_dims).c_str(), + dims2str(input2_dims).c_str()); + inferred_out_shape = std::move(input0_dims); + } else { // can broadcast + status_t ret1 = broadcast(input0_dims, input1_dims, inferred_out_shape); + VCHECK_INVALID_SHAPE((ret1 == status::success), + "%s, failed to implement numpy broadcasting", + op_t::kind2str(n->get_kind()).c_str()); + } + + auto out0 = logical_tensor_wrapper_t(outputs[0]); + // check if given or partial set shape aligns with inferred shape + if (!out0.is_shape_unknown() || out0.ndims() != -1) { + VCHECK_INVALID_SHAPE(validate(inferred_out_shape, out0.vdims()), + "%s, inferred out shape and output shape are not compatible", + op_t::kind2str(n->get_kind()).c_str()); + if (!out0.is_shape_unknown()) return status::success; + } + + set_shape_and_strides(*outputs[0], inferred_out_shape); + return status::success; +} + status_t infer_dnnl_binary_output_shape(op_t *n, std::vector &inputs, std::vector &outputs) { const bool is_bias_add = n->has_attr(op_attr::is_bias_add) && n->get_attr(op_attr::is_bias_add); - - auto ret = is_bias_add - ? infer_bias_add_output_shape(n, inputs, outputs) - : infer_elemwise_arithmetic_output_shape(n, inputs, outputs); - - return ret; + const algorithm algo = static_cast( + n->get_attr(op_attr::alg_kind)); + if (algo == algorithm::binary_select) { + return infer_binary_select_output_shape(n, inputs, outputs); + } else if (is_bias_add) { + return infer_bias_add_output_shape(n, inputs, outputs); + } else { + return infer_elemwise_arithmetic_output_shape(n, inputs, outputs); + } } } // namespace dnnl_impl diff --git a/src/graph/backend/dnnl/dnnl_shape_infer.hpp b/src/graph/backend/dnnl/dnnl_shape_infer.hpp index 22ef21b65ae..78368597062 100644 --- a/src/graph/backend/dnnl/dnnl_shape_infer.hpp +++ b/src/graph/backend/dnnl/dnnl_shape_infer.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -103,6 +103,10 @@ status_t infer_dnnl_binary_output_shape(op_t *n, std::vector &inputs, std::vector &outputs); +status_t infer_binary_select_output_shape(op_t *n, + std::vector &inputs, + std::vector &outputs); + } // namespace dnnl_impl } // namespace graph } // namespace impl diff --git a/src/graph/backend/dnnl/kernels/large_partition.cpp b/src/graph/backend/dnnl/kernels/large_partition.cpp index 9962d2f473a..d7514da0c1d 100644 --- a/src/graph/backend/dnnl/kernels/large_partition.cpp +++ b/src/graph/backend/dnnl/kernels/large_partition.cpp @@ -36,6 +36,8 @@ void larger_partition_kernel_t::setup_pipeline_stage1( pass_pipeline_t &pipeline) { // Directly lower down (1 to 1 mapping) BACKEND_DNNL_ADD_PASS(pipeline, lower_down); + // Decompose select to binary ops if necessary + BACKEND_DNNL_ADD_PASS(pipeline, decompose_select_to_binary_ops); // Indirectly lower down (N to 1 mapping) BACKEND_DNNL_ADD_PASS(pipeline, fuse_reciprocal_mul_to_div); diff --git a/src/graph/backend/dnnl/kernels/matmul.cpp b/src/graph/backend/dnnl/kernels/matmul.cpp index f0fc7193e4a..17005554cba 100644 --- a/src/graph/backend/dnnl/kernels/matmul.cpp +++ b/src/graph/backend/dnnl/kernels/matmul.cpp @@ -50,6 +50,9 @@ status_t matmul_t::compile_impl(const dnnl_partition_impl_t *part, pass_pipeline_t pipeline(vis); BACKEND_DNNL_ADD_PASS(pipeline, lower_down); + // Decompose select to binary ops if necessary + BACKEND_DNNL_ADD_PASS(pipeline, decompose_select_to_binary_ops); + BACKEND_DNNL_ADD_PASS(pipeline, fuse_bias_add); // check if bias exists BACKEND_DNNL_ADD_PASS(pipeline, check_with_bias); diff --git a/src/graph/backend/dnnl/kernels/sdp_decomp.cpp b/src/graph/backend/dnnl/kernels/sdp_decomp.cpp index ffde91622e0..9e1361d7add 100644 --- a/src/graph/backend/dnnl/kernels/sdp_decomp.cpp +++ b/src/graph/backend/dnnl/kernels/sdp_decomp.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,6 +60,8 @@ status_t sdp_decomp_kernel_t::compile_impl( pass_pipeline_t pipeline = pass_pipeline_t(vis); pass_pipeline_t select_pipeline = pass_pipeline_t(vis); BACKEND_DNNL_ADD_PASS(pipeline, lower_down); + // Decompose select to binary ops if necessary + BACKEND_DNNL_ADD_PASS(pipeline, decompose_select_to_binary_ops); BACKEND_DNNL_ADD_PASS(pipeline, fuse_reshape_for_gqa); // Fusion and canonicalization passes begin if (quantized) { diff --git a/src/graph/backend/dnnl/kernels/sdp_decomp_config.cpp b/src/graph/backend/dnnl/kernels/sdp_decomp_config.cpp index 8e49149d2c5..d09567286ad 100644 --- a/src/graph/backend/dnnl/kernels/sdp_decomp_config.cpp +++ b/src/graph/backend/dnnl/kernels/sdp_decomp_config.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,6 +58,23 @@ bool sdp_decomp_config_t::initial_check(const std::shared_ptr &sg, "Only supports single scale value, but got %lld", scale_sz); } + // Check select cond and src0 shape + if (graph_inport[5] != -1 && graph_inport[6] != -1) { + const auto select_cond_dims = ltw(inputs[graph_inport[5]]).vdims(); + const auto select_src0_dims = ltw(inputs[graph_inport[6]]).vdims(); + VCHECK_SDP_DECOMP(select_cond_dims.size() == select_src0_dims.size(), + false, + "Select cond and src0 dims should be same, but got %zu and %zu", + select_cond_dims.size(), select_src0_dims.size()); + for (size_t i = 0; i < select_cond_dims.size(); i++) { + + VCHECK_SDP_DECOMP(select_cond_dims[i] == select_src0_dims[i], false, + "Select cond and src0 dims should be same, but got %lld " + "and %lld", + select_cond_dims[i], select_src0_dims[i]); + } + } + #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP // RATIO is an empirical value used to determine the numerical relationship // between batch_size, num_head_q and thread number to determine whether to use diff --git a/src/graph/backend/dnnl/kernels/select.cpp b/src/graph/backend/dnnl/kernels/select.cpp index 1434c8bc2cd..9e7b60fe118 100644 --- a/src/graph/backend/dnnl/kernels/select.cpp +++ b/src/graph/backend/dnnl/kernels/select.cpp @@ -49,6 +49,9 @@ status_t select_t::compile_impl(const dnnl_partition_impl_t *part, pass_pipeline_t pipeline(vis); BACKEND_DNNL_ADD_PASS(pipeline, lower_down); + // Decompose select to binary ops if necessary + BACKEND_DNNL_ADD_PASS(pipeline, decompose_select_to_binary_ops); + BACKEND_DNNL_ADD_PASS(pipeline, binary_canonicalization); BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_ops); diff --git a/src/graph/backend/dnnl/op_executable.cpp b/src/graph/backend/dnnl/op_executable.cpp index 55b0c7e9f96..ef433a0a413 100644 --- a/src/graph/backend/dnnl/op_executable.cpp +++ b/src/graph/backend/dnnl/op_executable.cpp @@ -1252,8 +1252,15 @@ binary_executable_t::desc_t binary_executable_t::create_desc( op->get_attr(op_attr::alg_kind)); dnnl::binary::primitive_desc pd; - pd = dnnl::binary::primitive_desc( - p_engine, algo, src0, src1, dst, prm_attr); + if (algo == algorithm::binary_select) { + auto src2 = make_dnnl_memory_desc( + op->get_input_value(2)->get_logical_tensor()); + pd = dnnl::binary::primitive_desc( + p_engine, algo, src0, src1, src2, dst, prm_attr); + } else { + pd = dnnl::binary::primitive_desc( + p_engine, algo, src0, src1, dst, prm_attr); + } pd_cache.insert({op.get(), pd}); @@ -1891,12 +1898,16 @@ arg_indices_t matmul_executable_t::get_arg_indices( arg_indices_t binary_executable_t::get_arg_indices( const op_t *op, fusion_info_mgr_t &mgr) { arg_indices_t arg_indices; + const algorithm algo = static_cast( + op->get_attr(op_attr::alg_kind)); // add input args size_t index = 0; arg_indices.insert({DNNL_ARG_SRC_0, indices_t {input, index++}}); arg_indices.insert({DNNL_ARG_SRC_1, indices_t {input, index++}}); - + if (algo == algorithm::binary_select) { + arg_indices.insert({DNNL_ARG_SRC_2, indices_t {input, index++}}); + } get_arg_indices_for_post_ops(op, mgr, arg_indices, index); // add output args diff --git a/src/graph/backend/dnnl/passes/lower.cpp b/src/graph/backend/dnnl/passes/lower.cpp index 18589444b90..888520dbb46 100644 --- a/src/graph/backend/dnnl/passes/lower.cpp +++ b/src/graph/backend/dnnl/passes/lower.cpp @@ -664,114 +664,33 @@ static status_t select_handler( auto cond = in_vals[0]; auto src0 = in_vals[1]; auto src1 = in_vals[2]; - cond->set_data_type(dnnl::impl::data_type::u8); - - //TODO: This reorder can be removed once 
eltwise_clip support int8 input - op_ptr type_cast = std::make_shared(op_kind::dnnl_reorder); - type_cast->set_attr(op_attr::change_layout, false); - - op_ptr clip = std::make_shared(op_kind::dnnl_eltwise); - clip->set_attr(op_attr::alg_kind, - static_cast(dnnl::algorithm::eltwise_clip)); - clip->set_attr(op_attr::alpha, 0.f); - clip->set_attr(op_attr::beta, 1.f); - - // After reorder and clip. The cond value is 0 or 1. - // Then output = src0.*cond+src1.*(cond*-1 + 1) - op_ptr mul1 = std::make_shared(op_kind::dnnl_binary); - mul1->set_attr(op_attr::alg_kind, - static_cast(dnnl::algorithm::binary_mul)); - mul1->merge_attributes(op->get_attributes()); - - op_ptr mul2 = std::make_shared(op_kind::dnnl_binary); - mul2->set_attr(op_attr::alg_kind, - static_cast(dnnl::algorithm::binary_mul)); - mul2->merge_attributes(op->get_attributes()); - - op_ptr linear = std::make_shared(op_kind::dnnl_eltwise); - linear->set_attr(op_attr::alg_kind, - static_cast(dnnl::algorithm::eltwise_linear)); - const float alpha_value = -1.0f, beta_value = 1.0f; - linear->set_attr(op_attr::alpha, alpha_value); - linear->set_attr(op_attr::beta, beta_value); - - op_ptr add = std::make_shared(op_kind::dnnl_binary); - add->set_attr(op_attr::alg_kind, - static_cast(dnnl::algorithm::binary_add)); + // For the binary select operation, the conditional input tensor can + // only be of `s8` data type. + cond->set_data_type(dnnl::impl::data_type::s8); + + op_ptr new_op = std::make_shared(op_kind::dnnl_binary); + new_op->set_attr(op_attr::alg_kind, + static_cast(get_binary_alg_map().at(op->get_kind()))); + new_op->merge_attributes(op->get_attributes()); // reconnect cond->remove_consumer(*op, 0); src0->remove_consumer(*op, 1); src1->remove_consumer(*op, 2); - // first reorder and clip - cond->add_consumer(*type_cast, 0); - type_cast->add_input(cond); - logical_tensor_t float_cond = empty_logical_tensor_with_default_id(); - auto float_cond_val - = std::make_shared(*type_cast, 0, float_cond, true); - float_cond_val->set_data_type(dnnl::impl::data_type::f32); - type_cast->add_output(float_cond_val); - insert_empty_scratchpad(type_cast); - - float_cond_val->add_consumer(*clip, 0); - clip->add_input(float_cond_val); - logical_tensor_t clip_cond = empty_logical_tensor_with_default_id(); - auto clip_cond_val = std::make_shared(*clip, 0, clip_cond, true); - clip_cond_val->set_data_type( - float_cond_val->get_logical_tensor().data_type); - clip->add_output(clip_cond_val); - insert_empty_scratchpad(clip); - - // first multiply - src0->add_consumer(*mul1, 0); - clip_cond_val->add_consumer(*mul1, 1); - mul1->add_input(src0); - mul1->add_input(clip_cond_val); - - logical_tensor_t src0_cond = empty_logical_tensor_with_default_id(); - auto src0_val = std::make_shared(*mul1, 0, src0_cond, true); - src0_val->set_data_type(src0->get_logical_tensor().data_type); - mul1->add_output(src0_val); - insert_empty_scratchpad(mul1); - - //cond.*{-1} + 1 - clip_cond_val->add_consumer(*linear, 0); - linear->add_input(clip_cond_val); - - logical_tensor_t cond_inv = empty_logical_tensor_with_default_id(); - auto cond_inv_val = std::make_shared(*linear, 0, cond_inv, true); - cond_inv_val->set_data_type(clip_cond_val->get_logical_tensor().data_type); - linear->add_output(cond_inv_val); - insert_empty_scratchpad(linear); - - //src1.*(cond_inv) - - src1->add_consumer(*mul2, 0); - cond_inv_val->add_consumer(*mul2, 1); - mul2->add_input(src1); - mul2->add_input(cond_inv_val); - - logical_tensor_t src1_cond = empty_logical_tensor_with_default_id(); - auto src1_val = 
std::make_shared(*mul2, 0, src1_cond, true); - src1_val->set_data_type(src1->get_logical_tensor().data_type); - mul2->add_output(src1_val); - insert_empty_scratchpad(mul2); - - src0_val->add_consumer(*add, 0); - src1_val->add_consumer(*add, 1); - add->add_input(src0_val); - add->add_input(src1_val); - add->add_output(out_vals[0]); - insert_empty_scratchpad(add); - - // add new ops and delete select op - rewriter.to_insert(type_cast); - rewriter.to_insert(clip); - rewriter.to_insert(mul1); - rewriter.to_insert(linear); - rewriter.to_insert(mul2); - rewriter.to_insert(add); + // binary select primitive places the condition input tensor as the + // third input tensor. + src0->add_consumer(*new_op, 0); + src1->add_consumer(*new_op, 1); + cond->add_consumer(*new_op, 2); + + new_op->add_input(src0); + new_op->add_input(src1); + new_op->add_input(cond); + new_op->add_output(out_vals[0]); + + insert_empty_scratchpad(new_op); + rewriter.to_insert(new_op); rewriter.to_remove(op); return status::success; diff --git a/src/graph/backend/dnnl/passes/transform.cpp b/src/graph/backend/dnnl/passes/transform.cpp index a16106babad..b5133e6135b 100644 --- a/src/graph/backend/dnnl/passes/transform.cpp +++ b/src/graph/backend/dnnl/passes/transform.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2268,6 +2268,9 @@ status_t binary_canonicalization(std::shared_ptr &sg) { std::vector in_ndims {src0_ndims, src1_ndims}; for (size_t i = 0; i < cur_op->num_inputs(); ++i) { if (in_ndims[i] == target_ndims) { continue; } + // For binary select op, broadcast for the third input is + // unsupported. + if (i == 2) { continue; } std::vector axes(target_ndims - in_ndims[i]); std::iota(axes.begin(), axes.end(), 0); @@ -2297,6 +2300,147 @@ status_t binary_canonicalization(std::shared_ptr &sg) { return infer_shape(sg); } +status_t decompose_select_to_binary_ops(std::shared_ptr &sg) { + subgraph_rewriter_t rewriter(sg); + for (auto &op : sg->get_ops()) { + if (op->get_kind() != op_kind::dnnl_binary) continue; + const algorithm algo = static_cast( + op->get_attr(op_attr::alg_kind)); + + if (algo != algorithm::binary_select) continue; + + // For the binary select primitive, broadcast semantics are not + // supported for the third conditional input tensor. For this case, the + // shape of the conditional input tensor must match that of the source 0 + // tensor. + // The binary select primitive is unsupported on GPU. + const bool require_broadcast = need_broadcast_for_inputs(op, 0, 2); + if (!require_broadcast && sg->get_engine_kind() != engine_kind::gpu) + continue; + + auto in_vals = op->get_input_values(); + auto out_vals = op->get_output_values(); + + auto src0 = in_vals[0]; + auto src1 = in_vals[1]; + auto cond = in_vals[2]; + cond->set_data_type(dnnl::impl::data_type::u8); + + //TODO: This reorder can be removed once eltwise_clip support int8 input + op_ptr type_cast = std::make_shared(op_kind::dnnl_reorder); + type_cast->set_attr(op_attr::change_layout, false); + + op_ptr clip = std::make_shared(op_kind::dnnl_eltwise); + clip->set_attr(op_attr::alg_kind, + static_cast(dnnl::algorithm::eltwise_clip)); + clip->set_attr(op_attr::alpha, 0.f); + clip->set_attr(op_attr::beta, 1.f); + + // After reorder and clip. The cond value is 0 or 1. 
+ // Then output = src0.*cond+src1.*(cond*-1 + 1) + op_ptr mul1 = std::make_shared(op_kind::dnnl_binary); + mul1->merge_attributes(op->get_attributes()); + mul1->set_attr(op_attr::alg_kind, + static_cast(dnnl::algorithm::binary_mul)); + + op_ptr mul2 = std::make_shared(op_kind::dnnl_binary); + mul2->merge_attributes(op->get_attributes()); + mul2->set_attr(op_attr::alg_kind, + static_cast(dnnl::algorithm::binary_mul)); + + op_ptr linear = std::make_shared(op_kind::dnnl_eltwise); + linear->set_attr(op_attr::alg_kind, + static_cast(dnnl::algorithm::eltwise_linear)); + const float alpha_value = -1.0f, beta_value = 1.0f; + linear->set_attr(op_attr::alpha, alpha_value); + linear->set_attr(op_attr::beta, beta_value); + + op_ptr add = std::make_shared(op_kind::dnnl_binary); + add->set_attr(op_attr::alg_kind, + static_cast(dnnl::algorithm::binary_add)); + + // reconnect + src0->remove_consumer(*op, 0); + src1->remove_consumer(*op, 1); + cond->remove_consumer(*op, 2); + + // first reorder and clip + cond->add_consumer(*type_cast, 0); + type_cast->add_input(cond); + logical_tensor_t float_cond = empty_logical_tensor_with_default_id(); + auto float_cond_val + = std::make_shared(*type_cast, 0, float_cond, true); + float_cond_val->set_data_type(dnnl::impl::data_type::f32); + type_cast->add_output(float_cond_val); + insert_empty_scratchpad(type_cast); + + float_cond_val->add_consumer(*clip, 0); + clip->add_input(float_cond_val); + logical_tensor_t clip_cond = empty_logical_tensor_with_default_id(); + auto clip_cond_val + = std::make_shared(*clip, 0, clip_cond, true); + clip_cond_val->set_data_type( + float_cond_val->get_logical_tensor().data_type); + clip->add_output(clip_cond_val); + insert_empty_scratchpad(clip); + + // first multiply + src0->add_consumer(*mul1, 0); + clip_cond_val->add_consumer(*mul1, 1); + mul1->add_input(src0); + mul1->add_input(clip_cond_val); + + logical_tensor_t src0_cond = empty_logical_tensor_with_default_id(); + auto src0_val = std::make_shared(*mul1, 0, src0_cond, true); + src0_val->set_data_type(src0->get_logical_tensor().data_type); + mul1->add_output(src0_val); + insert_empty_scratchpad(mul1); + + //cond.*{-1} + 1 + clip_cond_val->add_consumer(*linear, 0); + linear->add_input(clip_cond_val); + + logical_tensor_t cond_inv = empty_logical_tensor_with_default_id(); + auto cond_inv_val + = std::make_shared(*linear, 0, cond_inv, true); + cond_inv_val->set_data_type( + clip_cond_val->get_logical_tensor().data_type); + linear->add_output(cond_inv_val); + insert_empty_scratchpad(linear); + + //src1.*(cond_inv) + + src1->add_consumer(*mul2, 0); + cond_inv_val->add_consumer(*mul2, 1); + mul2->add_input(src1); + mul2->add_input(cond_inv_val); + + logical_tensor_t src1_cond = empty_logical_tensor_with_default_id(); + auto src1_val = std::make_shared(*mul2, 0, src1_cond, true); + src1_val->set_data_type(src1->get_logical_tensor().data_type); + mul2->add_output(src1_val); + insert_empty_scratchpad(mul2); + + src0_val->add_consumer(*add, 0); + src1_val->add_consumer(*add, 1); + add->add_input(src0_val); + add->add_input(src1_val); + add->add_output(out_vals[0]); + insert_empty_scratchpad(add); + + // add new ops and delete select op + rewriter.to_insert(type_cast); + rewriter.to_insert(clip); + rewriter.to_insert(mul1); + rewriter.to_insert(linear); + rewriter.to_insert(mul2); + rewriter.to_insert(add); + rewriter.to_remove(op); + } + rewriter.run(); + return infer_shape(sg); +} + status_t binary_broadcast_swap(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); diff --git 
a/src/graph/backend/dnnl/passes/transform.hpp b/src/graph/backend/dnnl/passes/transform.hpp
index 4378c527e20..ef7329a4d89 100644
--- a/src/graph/backend/dnnl/passes/transform.hpp
+++ b/src/graph/backend/dnnl/passes/transform.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright 2021-2024 Intel Corporation
+ * Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -119,6 +119,14 @@ status_t fuse_to_dnnl_sum(std::shared_ptr &sg);
 // make the input shape meet the requirement of oneDNN binary primitive
 status_t binary_canonicalization(std::shared_ptr &sg);
 
+// For now, we support two impl paths for the select op: one uses the binary
+// primitive with the select algorithm, the other uses multiple binary ops (we
+// call it the "legacy impl" here). However, during the lowering pass we
+// directly lower the front-end select op to a single binary select op, so this
+// pass decides which impl path to apply and decomposes the binary select op
+// back into multiple binary ops when the legacy impl is used.
+status_t decompose_select_to_binary_ops(std::shared_ptr &sg);
+
 // This pass is used to swap two inputs to broadcast src1 which is optimized in
 // oneDNN binary primitive. Notice that this should be applied after
 // binary_canonicalization and infer_shape
diff --git a/src/graph/backend/dnnl/passes/utils.cpp b/src/graph/backend/dnnl/passes/utils.cpp
index 98e7093d668..88159700f96 100644
--- a/src/graph/backend/dnnl/passes/utils.cpp
+++ b/src/graph/backend/dnnl/passes/utils.cpp
@@ -250,7 +250,8 @@ const std::map &get_binary_alg_map() {
             {graph::op_kind::Maximum, dnnl::algorithm::binary_max},
             {graph::op_kind::Subtract, dnnl::algorithm::binary_sub},
             {graph::op_kind::BiasAdd, dnnl::algorithm::binary_add},
-            {graph::op_kind::GreaterEqual, dnnl::algorithm::binary_ge}};
+            {graph::op_kind::GreaterEqual, dnnl::algorithm::binary_ge},
+            {graph::op_kind::Select, dnnl::algorithm::binary_select}};
     return binary_alg_map;
 }
 
@@ -646,6 +647,21 @@ bool inverse_mul_scales(std::shared_ptr &scale_op) {
     return true;
 }
 
+bool need_broadcast_for_inputs(
+        const std::shared_ptr &op, size_t index1, size_t index2) {
+    auto in_vals = op->get_input_values();
+
+    const dims input1_dims
+            = logical_tensor_wrapper_t(in_vals[index1]->get_logical_tensor())
+                      .vdims();
+    const dims input2_dims
+            = logical_tensor_wrapper_t(in_vals[index2]->get_logical_tensor())
+                      .vdims();
+
+    if (input1_dims != input2_dims) { return true; }
+
+    return false;
+}
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl
diff --git a/src/graph/backend/dnnl/passes/utils.hpp b/src/graph/backend/dnnl/passes/utils.hpp
index 912f2bc531b..6ab4536157d 100644
--- a/src/graph/backend/dnnl/passes/utils.hpp
+++ b/src/graph/backend/dnnl/passes/utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2024 Intel Corporation
+* Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -348,6 +348,9 @@ std::shared_ptr clone_mul_scales(const std::shared_ptr &scale_op); // This function is used to inverse scales of a dnnl_mul_scales op bool inverse_mul_scales(std::shared_ptr &scale_op); +bool need_broadcast_for_inputs( + const std::shared_ptr &op, size_t index1, size_t index2); + } // namespace dnnl_impl } // namespace graph } // namespace impl diff --git a/src/graph/interface/shape_infer.cpp b/src/graph/interface/shape_infer.cpp index 8f1c8a3d94e..556eb631958 100644 --- a/src/graph/interface/shape_infer.cpp +++ b/src/graph/interface/shape_infer.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,9 +32,6 @@ namespace dnnl { namespace impl { namespace graph { -// utils function -namespace { - std::string dims2str(const dims &dims) { if (dims.empty()) return std::string(""); @@ -45,8 +42,6 @@ std::string dims2str(const dims &dims) { return str; } -} // namespace - /// convert shape to ncx or oix dims canonicalize(const dims &shape, const std::string &format) { dims ret(shape); diff --git a/src/graph/interface/shape_infer.hpp b/src/graph/interface/shape_infer.hpp index 976e4c481ff..a9b72305cd3 100644 --- a/src/graph/interface/shape_infer.hpp +++ b/src/graph/interface/shape_infer.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,6 +74,8 @@ status_t infer_auto_pad(const dim_t in_dim, const dim_t stride, /// TODO(xxx): 0-D broadcasting? status_t broadcast(const dims &lhs, const dims &rhs, dims &broadcasted); +std::string dims2str(const dims &dims); + status_t one_way_broadcast(const dims &lhs, const dims &rhs); /// This function assumes the size of all vectors are correct. Eg. 
size of From c39b83bc20f1f0afbba3cdc86c9925d85214adb3 Mon Sep 17 00:00:00 2001 From: Jiexin-Zheng Date: Fri, 10 Jan 2025 06:52:47 +0000 Subject: [PATCH 06/40] benchdnn: graph: add select broadcast cases --- tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all | 2 ++ tests/benchdnn/inputs/graph/op/harness_bf16_all | 2 ++ tests/benchdnn/inputs/graph/op/harness_f16_all | 2 ++ tests/benchdnn/inputs/graph/op/harness_f32_all | 1 + 4 files changed, 7 insertions(+) diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all index 13d8e7ccd6d..6f1d9b9680c 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all @@ -35,6 +35,7 @@ --reset --dt=f32,bf16,f16 --in-shapes=3:20x16x384x64+4:20x16x64x384+0:20x16x384x64+1:20x1x1x384 --case=complex_fusion/mha/MHA-bert_large-inf-fp32-bs1.json --reset --dt=f32,bf16,f16 --in-shapes=3:10x16x384x64+4:10x1x64x384+0:10x1x384x64+1:10x1x1x384 --case=complex_fusion/mha/MHA-bert_large-inf-fp32-bs1.json --reset --dt=f32,bf16,f16 --in-shapes=4:56x12x128x64+5:56x12x64x128+0:56x12x128x64+1:56x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-fp32-bs1.json +--reset --dt=f32,bf16,f16 --in-shapes=2:1x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-fp32-bs1.json --reset --dt=f32,bf16,f16 --in-shapes=0:56x8x1024x80+1:56x8x77x80+2:56x8x77x80 --case=complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json --reset --expected-n-partitions=0 --dt=f32,bf16,f16 --in-shapes=5:20x117x48x128+6:20x1x128x117+19:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-fp32-bs1.json --reset --expected-n-partitions=0 --dt=f32,bf16,f16 --in-shapes=2514:32x16x512x64+2518:32x16x512x64+2543:32x1x512x512+2547:32x16x512x512+2525:32x16x512x64 --op-attrs=4837:shape:16384x1024 --case=complex_fusion/mha/MHA_forward-Bert_large-train-fp32-bs4.json @@ -51,6 +52,7 @@ --reset --expected-n-partitions=0 --in-shapes=4:4x32x32x128+3:4x32x128x33+0:4x32x33x128+1:4x1x32x33 --case=complex_fusion/mha/MHA-LLaMa-inf-int8-bs1.json --reset --in-shapes=4:20x16x384x64+3:20x16x64x384+0:20x16x384x64+1:20x1x1x384 --case=complex_fusion/mha/MHA-bert_large-inf-int8-bs1.json --reset --in-shapes=5:56x12x128x64+4:56x12x64x128+0:56x12x128x64+1:56x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-int8-bs1.json +--reset --in-shapes=2:1x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-int8-bs1.json --reset --expected-n-partitions=0 --in-shapes=4:20x117x48x128+3:20x1x128x117+0:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-int8-bs1.json --reset --expected-n-partitions=0 --in-shapes=4:32x16x384x64+3:32x16x64x384+0:32x16x384x64+1:32x1x1x384 --case=complex_fusion/mha/dynamic_quantized_mha-Bert_large-inf-int8-bs1-fake.json --reset --in-shapes=4:20x16x384x64+3:20x16x64x384+0:20x16x384x64+1:20x1x1x384 --case=complex_fusion/mha/sdpa-plain-wo-scale-int8-bs1.json diff --git a/tests/benchdnn/inputs/graph/op/harness_bf16_all b/tests/benchdnn/inputs/graph/op/harness_bf16_all index 7e4e8abc9aa..70847b2f2b9 100644 --- a/tests/benchdnn/inputs/graph/op/harness_bf16_all +++ b/tests/benchdnn/inputs/graph/op/harness_bf16_all @@ -153,6 +153,8 @@ --reset --dt=bf16 --in-shapes=1:1x1x1x1 --case=op/f32/greaterequal.json --reset --dt=bf16 --in-shapes=1:1 --case=op/f32/greaterequal.json +# select +--reset --dt=bf16 --in-shapes=2:1x1x1x128 --case=op/f32/select.json # concat --reset --dt=bf16 --in-shapes=0:1x4096x14x14+1:1x4096x14x14 --case=op/f32/concat.json --reset 
--dt=bf16 --in-shapes=0:64x128x28x28+1:64x128x28x28 --op-attrs=0:axis:1 --case=op/f32/concat.json diff --git a/tests/benchdnn/inputs/graph/op/harness_f16_all b/tests/benchdnn/inputs/graph/op/harness_f16_all index ee77a8943d8..b6539efd726 100644 --- a/tests/benchdnn/inputs/graph/op/harness_f16_all +++ b/tests/benchdnn/inputs/graph/op/harness_f16_all @@ -153,6 +153,8 @@ --reset --dt=f16 --in-shapes=1:1x1x1x1 --case=op/f32/greaterequal.json --reset --dt=f16 --in-shapes=1:1 --case=op/f32/greaterequal.json +# select +--reset --dt=bf16 --in-shapes=2:1x1x1x128 --case=op/f32/select.json # concat --reset --dt=f16 --in-shapes=0:1x4096x14x14+1:1x4096x14x14 --case=op/f32/concat.json --reset --dt=f16 --in-shapes=0:64x128x28x28+1:64x128x28x28 --op-attrs=0:axis:1 --case=op/f32/concat.json diff --git a/tests/benchdnn/inputs/graph/op/harness_f32_all b/tests/benchdnn/inputs/graph/op/harness_f32_all index ff8781a57e1..da402ae12ff 100644 --- a/tests/benchdnn/inputs/graph/op/harness_f32_all +++ b/tests/benchdnn/inputs/graph/op/harness_f32_all @@ -948,6 +948,7 @@ --reset --in-shapes=0:2x9x3x5x7*acdeb+1:2x9x2x8x12*acdeb --op-attrs=0:sizes:2x8x12*mode:linear --case=op/f32/interpolate_bwd.json --reset --in-shapes=0:2x9x3x8x6*acdeb+1:2x9x2x5x12*acdeb --op-attrs=0:sizes:2x5x12*mode:linear --case=op/f32/interpolate_bwd.json --reset --in-shapes=0:2x9x3x8x7*acdeb+1:2x9x2x5x12*acdeb --op-attrs=0:sizes:2x5x12*mode:linear --case=op/f32/interpolate_bwd.json +--reset --in-shapes=2:1x1x1x128 --case=op/f32/select.json --reset --case=op/f32/select.json --reset --case=op/f32/gnorm.json --reset --case=op/f32/static_reshape.json From d9bfcf0f130f608af80d1f838f5c7344240b4071 Mon Sep 17 00:00:00 2001 From: "Zhang, Rong A" Date: Thu, 26 Dec 2024 18:11:48 -0800 Subject: [PATCH 07/40] graph: backend: dnnl: passes: verbose log enhancement --- src/graph/backend/dnnl/passes/compile_ops.cpp | 28 ++-- src/graph/backend/dnnl/passes/insert_ops.cpp | 52 ++++-- .../dnnl/passes/layout_propagation.cpp | 39 ++--- src/graph/backend/dnnl/passes/lower.cpp | 51 +++--- .../backend/dnnl/passes/memory_planning.cpp | 65 ++++---- src/graph/backend/dnnl/passes/transform.cpp | 148 +++++++++++------- src/graph/backend/dnnl/passes/utils.cpp | 51 +++--- src/graph/backend/dnnl/passes/utils.hpp | 15 +- 8 files changed, 255 insertions(+), 194 deletions(-) diff --git a/src/graph/backend/dnnl/passes/compile_ops.cpp b/src/graph/backend/dnnl/passes/compile_ops.cpp index 9eb65a4c0d4..d3ac4c34b0f 100644 --- a/src/graph/backend/dnnl/passes/compile_ops.cpp +++ b/src/graph/backend/dnnl/passes/compile_ops.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,10 @@ #include "oneapi/dnnl/dnnl.hpp" +#define VCHECK_COMPILE_OPS(cond, status, msg, ...) 
\ + VCONDCHECK(graph, create, check, compile_ops, (cond), status, msg, \ + ##__VA_ARGS__); + namespace dnnl { namespace impl { namespace graph { @@ -43,15 +47,14 @@ status_t compile_ops(std::shared_ptr &sg) { return topo_order_visit(sg->get_output_ops(), [&](op_t *op) { const op_schema_t *opm = op_schema_registry_t::get_op_schema(op->get_kind()); - if (!opm) { - assertm(false, "no schema for current op"); - return status::invalid_graph_op; - } - if (!opm->has_additional_item("executable_creator")) { - assertm(false, "no executable creator in this op schema"); - return status::invalid_graph_op; - } + VCHECK_COMPILE_OPS(opm != nullptr, status::invalid_graph_op, + "no schema for current op %s", op->get_name().c_str()); + + VCHECK_COMPILE_OPS(opm->has_additional_item("executable_creator"), + status::invalid_graph_op, + "no executable creator in schema of op %s", + op->get_name().c_str()); auto cur_op = op->shared_from_this(); auto creator = opm->get_additional_item( @@ -59,10 +62,9 @@ status_t compile_ops(std::shared_ptr &sg) { std::shared_ptr exec = creator(cur_op, p_engine, mgr, pd_cache); - if (!exec) { - assertm(false, "unimplemented op, can't compile it"); - return status::unimplemented; - } + VCHECK_COMPILE_OPS(exec != nullptr, status::invalid_graph_op, + "unimplemented op, can't compile op %s", + op->get_name().c_str()); sg->execs_.emplace_back(exec); sg->is_constant_.push_back(op->has_attr(op_attr::is_constant) diff --git a/src/graph/backend/dnnl/passes/insert_ops.cpp b/src/graph/backend/dnnl/passes/insert_ops.cpp index 52dbf5012cc..d29c08341f3 100644 --- a/src/graph/backend/dnnl/passes/insert_ops.cpp +++ b/src/graph/backend/dnnl/passes/insert_ops.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,10 @@ #include "graph/backend/dnnl/passes/insert_ops.hpp" #include "graph/backend/dnnl/passes/utils.hpp" +#define VCHECK_INSERT_OPS(cond, status, msg, ...) \ + VCONDCHECK(graph, create, check, insert_ops, (cond), status, msg, \ + ##__VA_ARGS__); + namespace dnnl { namespace impl { namespace graph { @@ -332,15 +336,21 @@ status_t insert_to_group_for_reorder(std::shared_ptr &sg) { // reorder's input has blocked format with group // while output has plain format, perhaps for // backward path. 
No such case for now, disable - return status::unimplemented; + VCHECK_INSERT_OPS(false, status::unimplemented, + "unsupported i/o dimentions to insert to_group for " + "reorder, input ndims: %d, output ndims: %d", + in_md.get_ndims(), out_md.get_ndims()); } else if (in_md.get_ndims() + 1 == out_md.get_ndims()) { // reorder's input has plain format while output // has blocked format with group, typically for // weight prepacking auto group = out_md.get_dims()[0]; - if (group * out_md.get_dims()[1] != in_md.get_dims()[0]) - return status::invalid_shape; - + VCHECK_INSERT_OPS( + group * out_md.get_dims()[1] == in_md.get_dims()[0], + status::invalid_shape, + "unmatched shape to insert to_group for reorder, group: %d," + "output dims[1]: %d, input dims[0], %d", + group, out_md.get_dims()[1], in_md.get_dims()[0]); // insert to_group op op_ptr to_group_op = std::make_shared(op_kind::dnnl_to_group); to_group_op->set_attr(op_attr::groups, group); @@ -348,7 +358,8 @@ status_t insert_to_group_for_reorder(std::shared_ptr &sg) { rewriter.insert_op_before(to_group_op, cur_op, 0); } else { // illegal shape - return status::invalid_shape; + VCHECK_INSERT_OPS(false, status::invalid_shape, + "invalid shape to insert to_group for reorder"); } } @@ -573,7 +584,11 @@ status_t insert_unsqueeze_and_squeeze_for_matmul( int32_t src_ndims = op->get_input_value(0)->get_logical_tensor().ndims; int32_t wei_ndims = op->get_input_value(1)->get_logical_tensor().ndims; - assertm(src_ndims >= 1 && wei_ndims >= 1, "invalid dims"); + VCHECK_INSERT_OPS(src_ndims >= 1 && wei_ndims >= 1, + status::invalid_shape, + "src_ndims and wei_ndims should >= 1, src_ndims: %d, " + "wei_ndims: %d", + src_ndims, wei_ndims); int32_t unsqueezed_dst_ndims = std::max(std::max(src_ndims, wei_ndims), 2); @@ -690,8 +705,9 @@ impl::status_t insert_runtime_u8_to_s8_for_matmul( // add a binary add here. 
} } else { - assertm(cur_op->num_inputs() == index, - "only support insert input at the end of inputs"); + VCHECK_INSERT_OPS(cur_op->num_inputs() == index, + status::unimplemented, + "only support insert input for wei at the end of inputs"); std::vector zp {-128}; auto zps_op = std::make_shared(op_kind::dnnl_add_zps); zps_op->set_attr(op_attr::qtype, "per_tensor"); @@ -833,10 +849,11 @@ status_t insert_unsqueeze_for_prelu(std::shared_ptr &sg) { const bool per_channel_broadcast = cur_op->get_attr(op_attr::per_channel_broadcast); - if (!prelu_doable(ltw(src_lt).vdims(), ltw(wei_lt).vdims(), data_format, - per_channel_broadcast)) { - return status::invalid_shape; - } + const bool prelu_doable_status = prelu_doable(ltw(src_lt).vdims(), + ltw(wei_lt).vdims(), data_format, per_channel_broadcast); + VCHECK_INSERT_OPS(prelu_doable_status, status::invalid_shape, + "invalid shape to insert unsqueeze for prelu"); + // insert unsqueeze op int32_t src_ndims = src_lt.ndims; int32_t wei_ndims = wei_lt.ndims; @@ -886,10 +903,11 @@ status_t insert_unsqueeze_and_squeeze_for_prelu_bwd( const bool per_channel_broadcast = wei_vdims.size() == 1 && wei_vdims[0] != 1; - if (!prelu_doable(ltw(src_lt).vdims(), wei_vdims, data_format, - per_channel_broadcast)) { - return status::invalid_shape; - } + const bool prelu_doable_status = prelu_doable(ltw(src_lt).vdims(), + wei_vdims, data_format, per_channel_broadcast); + VCHECK_INSERT_OPS(prelu_doable_status, status::invalid_shape, + "invalid shape to insert unsqueeze for prelu"); + // insert unsqueeze op int32_t src_ndims = src_lt.ndims; int32_t wei_ndims = wei_lt.ndims; diff --git a/src/graph/backend/dnnl/passes/layout_propagation.cpp b/src/graph/backend/dnnl/passes/layout_propagation.cpp index cdd57b93665..916c8054b07 100644 --- a/src/graph/backend/dnnl/passes/layout_propagation.cpp +++ b/src/graph/backend/dnnl/passes/layout_propagation.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,10 @@ #include "graph/backend/dnnl/common.hpp" #include "graph/backend/dnnl/layout_propagator.hpp" +#define VCHECK_LAYOUT_PROPAGATION(cond, status, msg, ...) 
\ + VCONDCHECK(graph, create, check, layout_propagation, (cond), status, msg, \ + ##__VA_ARGS__); + namespace dnnl { namespace impl { namespace graph { @@ -117,15 +121,14 @@ status_t layout_propagation(std::shared_ptr &sg) { const op_schema_t *opm = op_schema_registry_t::get_op_schema(op->get_kind()); - if (!opm) { - assertm(false, "no schema for current op"); - return status::invalid_graph_op; - } + VCHECK_LAYOUT_PROPAGATION(opm != nullptr, status::invalid_graph_op, + "no schema for current op: %s", op->get_name().c_str()); - if (!opm->has_additional_item("layout_propagator")) { - assertm(false, "no layout propagator in this op schema"); - return status::invalid_graph_op; - } + VCHECK_LAYOUT_PROPAGATION( + opm->has_additional_item("layout_propagator"), + status::invalid_graph_op, + "no layout propagator in the schema of op: %s", + op->get_name().c_str()); auto cur_op = op->shared_from_this(); auto propagator = opm->get_additional_item( @@ -137,14 +140,14 @@ status_t layout_propagation(std::shared_ptr &sg) { return status; }); - if (ret != status::success) return ret; + VCHECK_LAYOUT_PROPAGATION( + ret == status::success, ret, "layout propagation failed"); rewriter.run(); propagation_number++; - if (propagation_number >= LAYOUT_PROPAGATION_NUMBER) { - assertm(false, - "expect layout propagation number to be less than 10"); - return status::invalid_arguments; - } + VCHECK_LAYOUT_PROPAGATION( + propagation_number < LAYOUT_PROPAGATION_NUMBER, + status::invalid_arguments, + "expect layout propagation number to be less than 10"); } while (need_prop_once_more(sg)); // Add check for the layout type of partition outputs to make partition @@ -160,8 +163,7 @@ status_t layout_propagation(std::shared_ptr &sg) { auto lt = in_val->get_logical_tensor(); if (lt.id == sg->ins_[i].id) { auto md = make_dnnl_memory_desc(lt); - auto status = fill_layout_info(&(sg->ins_[i]), md); - if (status != status::success) return status; + CHECK(fill_layout_info(&(sg->ins_[i]), md)); } } } @@ -172,8 +174,7 @@ status_t layout_propagation(std::shared_ptr &sg) { auto lt = out_val->get_logical_tensor(); if (lt.id == sg->outs_[i].id) { auto md = make_dnnl_memory_desc(lt); - auto status = fill_layout_info(&(sg->outs_[i]), md); - if (status != status::success) return status; + CHECK(fill_layout_info(&(sg->outs_[i]), md)); } } } diff --git a/src/graph/backend/dnnl/passes/lower.cpp b/src/graph/backend/dnnl/passes/lower.cpp index 888520dbb46..c7de715df3c 100644 --- a/src/graph/backend/dnnl/passes/lower.cpp +++ b/src/graph/backend/dnnl/passes/lower.cpp @@ -381,18 +381,17 @@ static status_t static_quant_handler( auto in_vals = op->get_input_values(); auto out_vals = op->get_output_values(); - assertm(in_vals.size() == 1 && out_vals.size() == 1, - "static quantize/dequantize should only have one input and " - "output"); - + VCHECK_INVALID_ARGUMENT(in_vals.size() == 1 && out_vals.size() == 1, + "static quantize/dequantize should only have one input and output" + " but got %zu input and %zu output", + in_vals.size(), out_vals.size()); + VCHECK_INVALID_ARGUMENT(std::all_of(scales.begin(), scales.end(), + [](float i) { return i != 0.f; }), + "scales can't be zero"); // int8 = f32 / scales + zps op_ptr mul_scales_op = std::make_shared(op_kind::dnnl_mul_scales); op_ptr add_zps_op = std::make_shared(op_kind::dnnl_add_zps); - assertm(std::all_of(scales.begin(), scales.end(), - [](float i) { return i != 0.f; }), - "scales can't be zero"); - std::vector inv_scales = dnnl_impl::utils::fmap(scales, [](float s) { return 1.f / s; }); 
mul_scales_op->set_attr>(op_attr::scales, inv_scales); @@ -439,8 +438,10 @@ static status_t static_dequant_handler( auto in_vals = cur_op->get_input_values(); auto out_vals = cur_op->get_output_values(); - assertm(in_vals.size() == 1 && out_vals.size() == 1, - "static dequantize should only have one input and output"); + VCHECK_INVALID_ARGUMENT(in_vals.size() == 1 && out_vals.size() == 1, + "static dequantize should only have one input and output but " + "got %zu input and %zu output", + in_vals.size(), out_vals.size()); // f32 = scales * (int8 - zps) op_ptr sub_zps_op = std::make_shared(op_kind::dnnl_sub_zps); @@ -484,9 +485,11 @@ static status_t dynamic_quant_handler( auto &in_vals = cur_op->get_input_values(); auto &out_vals = cur_op->get_output_values(); - assertm((in_vals.size() == 3 || in_vals.size() == 2) + VCHECK_INVALID_ARGUMENT((in_vals.size() == 3 || in_vals.size() == 2) && out_vals.size() == 1, - "dynamic quantize must have 2 or 3 inputs and 1 output"); + "dynamic quantize must have 2 or 3 inputs and 1 output, but " + "got %zu input and %zu output", + in_vals.size(), out_vals.size()); // DynamicQuantize has optional zps bool has_zps = in_vals.size() == 3; @@ -543,9 +546,11 @@ static status_t dynamic_dequant_handler( auto &in_vals = cur_op->get_input_values(); auto &out_vals = cur_op->get_output_values(); - assertm((in_vals.size() == 3 || in_vals.size() == 2) + VCHECK_INVALID_ARGUMENT((in_vals.size() == 3 || in_vals.size() == 2) && out_vals.size() == 1, - "dynamic dequantize must have 2 or 3 inputs and 1 output"); + "dynamic dequantize must have 2 or 3 inputs and 1 output, but " + "got %zu input and %zu output", + in_vals.size(), out_vals.size()); // DynamicDequantize has optional zps bool has_zps = in_vals.size() == 3; @@ -659,8 +664,10 @@ static status_t select_handler( auto in_vals = op->get_input_values(); auto out_vals = op->get_output_values(); - assertm(in_vals.size() == 3 && out_vals.size() == 1, - "select should have three inputs and a output"); + VCHECK_INVALID_ARGUMENT(in_vals.size() == 3 && out_vals.size() == 1, + "select should have three input and one output but " + "got %zu input and %zu output", + in_vals.size(), out_vals.size()); auto cond = in_vals[0]; auto src0 = in_vals[1]; auto src1 = in_vals[2]; @@ -828,13 +835,11 @@ status_t lower_down(std::shared_ptr &sg) { for (auto &cur_op : sg->get_ops()) { auto kind = cur_op->get_kind(); - if (!handler_table.count(kind)) { - assertm(false, - "All spec ops should be lowered to internal ops, except " - "for some utility ops like End, Wildcard"); - return status::invalid_graph_op; - } - + VCHECK_INVALID_ARGUMENT(handler_table.count(kind), + "All spec ops should be lowered to internal ops, except " + "for some utility ops like End, Wildcard. Current op name is " + "%s", + cur_op->get_name().c_str()); // lower this spec op to dnnl backend internal op const auto &handler = handler_table.at(kind); auto status = handler(cur_op, rewriter); diff --git a/src/graph/backend/dnnl/passes/memory_planning.cpp b/src/graph/backend/dnnl/passes/memory_planning.cpp index 8c2c17c7405..b6d4dc9367d 100644 --- a/src/graph/backend/dnnl/passes/memory_planning.cpp +++ b/src/graph/backend/dnnl/passes/memory_planning.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,6 +33,10 @@ #include "oneapi/dnnl/dnnl.hpp" +#define VCHECK_MEMORY_PLANNING(cond, status, msg, ...) \ + VCONDCHECK(graph, create, check, memory_planning, (cond), status, msg, \ + ##__VA_ARGS__); + namespace dnnl { namespace impl { namespace graph { @@ -343,7 +347,8 @@ status_t memory_planner_t::assign_external_inputs_buffer( // assign alias auto aliases = alias_analyzer_.get_all_aliases(val); for (auto &alias : aliases) { - assertm(!buffer_assignments_.count(alias), + VCHECK_MEMORY_PLANNING(!buffer_assignments_.count(alias), + status::runtime_error, "alias of input has been assigned buffer"); buffer_assignments_.insert(std::make_pair(alias, info)); } @@ -716,7 +721,11 @@ status_t memory_planner_t::book_buffers(std::shared_ptr &sg) { persistent_registrar.book(info.index_, persistent_buffer_assigner_.query_size(info.index_)); break; - default: return status::unimplemented; + default: + VCHECK_MEMORY_PLANNING(false, status::unimplemented, + "booking memory failed for unimplemented buffer kind " + "%d", + info.kind_); } } return status::success; @@ -770,21 +779,20 @@ status_t memory_planner_t::prepare_execution_args_set( } return status::success; }); - if (ret != status::success) return ret; + VCHECK_MEMORY_PLANNING( + ret == status::success, ret, "prepare memory failed"); // construct the dnnl execution args for each op ret = topo_order_visit(sg->get_output_ops(), [&](op_t *op) { const op_schema_t *opm = op_schema_registry_t::get_op_schema(op->get_kind()); - if (!opm) { - assertm(false, "no schema for current op"); - return status::invalid_graph_op; - } + VCHECK_MEMORY_PLANNING(opm != nullptr, status::invalid_graph_op, + "no schema for current op: %s", op->get_name().c_str()); - if (!opm->has_additional_item("arg_indices_getter")) { - assertm(false, "no arg indices getter in this op schema"); - return status::invalid_graph_op; - } + VCHECK_MEMORY_PLANNING(opm->has_additional_item("arg_indices_getter"), + status::invalid_graph_op, + "no arg indices getter in the schema of op: %s", + op->get_name().c_str()); auto getter = opm->get_additional_item( "arg_indices_getter"); @@ -805,7 +813,9 @@ status_t memory_planner_t::prepare_execution_args_set( // find the corresponding memory object dnnl::memory mem; if (!exec_args_set_.find_value_mem_map(val, mem)) { - return status::invalid_arguments; + VCHECK_MEMORY_PLANNING(false, status::invalid_arguments, + "can't find memory for value id: %zu", + val->get_logical_tensor().id); } dnnl_exec_args.insert({dnnl_arg, mem}); @@ -829,8 +839,6 @@ status_t memory_planner_t::prepare_execution_args_set( // - Assign internal allocated persistent buffer to corresponding edges. // - Prepare the memory objects which will be used in execution. 
status_t memory_planner_t::run(std::shared_ptr &sg) { - status_t ret; - auto &mgr = sg->fusion_info_mgr_; const auto &p_engine = *(sg->p_engine_); const auto &inputs = sg->ins_; @@ -866,21 +874,17 @@ status_t memory_planner_t::run(std::shared_ptr &sg) { } // Assign external_input buffers to subgraph's inputs and their alias - ret = assign_external_inputs_buffer(sg, inputs); - if (ret != status::success) return ret; + CHECK(assign_external_inputs_buffer(sg, inputs)); // Assign internal temporary buffer for all other edges - ret = assign_internal_temporary_buffer(sg, edge_ref_count, mgr, false); - if (ret != status::success) return ret; + CHECK(assign_internal_temporary_buffer(sg, edge_ref_count, mgr, false)); // Replace some internal temporary buffers to user given external output // buffer - ret = assign_external_outputs_buffer(sg, outputs, mgr); - if (ret != status::success) return ret; + CHECK(assign_external_outputs_buffer(sg, outputs, mgr)); // Replace some internal temporary buffers to cached persistent buffer - ret = assign_internal_persistent_buffer(sg, mgr); - if (ret != status::success) return ret; + CHECK(assign_internal_persistent_buffer(sg, mgr)); // Reset the unreplaced internal temporary buffer temporary_buffer_assigner_.clear(); @@ -895,20 +899,13 @@ status_t memory_planner_t::run(std::shared_ptr &sg) { // Re-assign internal temporary buffer for reset ones (will re-do memory // sharing between temporary buffers) - ret = assign_internal_temporary_buffer(sg, edge_ref_count, mgr, true); - if (ret != status::success) return ret; - + CHECK(assign_internal_temporary_buffer(sg, edge_ref_count, mgr, true)); // Check which input/output pair of the subgraph can be inplaced - ret = prepare_subgraph_inplace_pairs(sg, false); - if (ret != status::success) return ret; - - ret = book_buffers(sg); - if (ret != status::success) return ret; + CHECK(prepare_subgraph_inplace_pairs(sg, false)); + CHECK(book_buffers(sg)); // Bind memory object to each value - ret = prepare_execution_args_set(sg, p_engine, mgr); - if (ret != status::success) return ret; - + CHECK(prepare_execution_args_set(sg, p_engine, mgr)); return status::success; } diff --git a/src/graph/backend/dnnl/passes/transform.cpp b/src/graph/backend/dnnl/passes/transform.cpp index b5133e6135b..db0f75b2b63 100644 --- a/src/graph/backend/dnnl/passes/transform.cpp +++ b/src/graph/backend/dnnl/passes/transform.cpp @@ -40,10 +40,9 @@ #include "graph/backend/dnnl/passes/transform.hpp" #include "graph/backend/dnnl/passes/utils.hpp" -#define VCHECK_UNIMPLEMENTED(cond, msg, ...) \ - VCONDCHECK(graph, create, check, compile, (cond), status::unimplemented, \ - msg, ##__VA_ARGS__); - +#define VCHECK_TRANSFORM(cond, status, msg, ...) 
\ + VCONDCHECK(graph, create, check, transform, (cond), status, msg, \ + ##__VA_ARGS__); namespace dnnl { namespace impl { namespace graph { @@ -260,8 +259,9 @@ status_t convert_to_runtime_src_scales(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &cur_op : scales_ops) { - assertm(cur_op->num_outputs() == 1, - "scale_op should have only one output value."); + VCHECK_TRANSFORM(cur_op->num_outputs() == 1, status::invalid_graph_op, + "scale_op should have only one output value, but got %zu", + cur_op->num_outputs()); auto out_val = cur_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); if (consumers.empty()) continue; @@ -318,8 +318,9 @@ status_t convert_to_runtime_src_zero_points(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &zp_op : zp_ops) { - assertm(zp_op->num_outputs() == 1, - "zp_op should have only one output value."); + VCHECK_TRANSFORM(zp_op->num_outputs() == 1, status::invalid_graph_op, + "zp_op should have only one output value, but got %zu", + zp_op->num_outputs()); auto out_val = zp_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); @@ -377,8 +378,9 @@ status_t convert_to_runtime_dst_zero_points(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &zp_op : zp_ops) { - assertm(zp_op->num_outputs() == 1, - "zp_op should have only one output value."); + VCHECK_TRANSFORM(zp_op->num_outputs() == 1, status::invalid_graph_op, + "zp_op should have only one output value, but got %zu", + zp_op->num_outputs()); auto in_val = zp_op->get_input_values()[0]; bool is_output_zps = in_val->has_producer() && impl::utils::one_of(in_val->get_producer().get_kind(), @@ -426,8 +428,10 @@ status_t fold_mul_scales(std::shared_ptr &sg) { || visited.count(cur_op.get()) != 0) continue; - assertm(cur_op->num_outputs() == 1, - "cur_op should have only one output value."); + VCHECK_TRANSFORM(cur_op->num_outputs() == 1, false, + "dnnl_mul_scales should have only one output value, but " + "got %zu", + cur_op->num_outputs()); auto out_val = cur_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); if (consumers.empty()) continue; @@ -493,8 +497,10 @@ impl::status_t fold_sub_zps_add_zps(std::shared_ptr &sg) { || visited.count(cur_op.get()) != 0) continue; - assertm(cur_op->num_outputs() == 1, - "cur_op should have only one output value."); + VCHECK_TRANSFORM(cur_op->num_outputs() == 1, false, + "dnnl_sub_zps should have only one output value, but got " + "%zu", + cur_op->num_outputs()); auto out_val = cur_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); if (consumers.empty()) continue; @@ -593,8 +599,12 @@ status_t fuse_to_int8_concat(std::shared_ptr &sg) { rewriter.fuse_op_to_successor(scale_op.shared_from_this()); } - assertm(concat_op->get_output_value(0)->get_consumers().size() == 1, - "concat's successor op should only have one consumer."); + VCHECK_TRANSFORM( + concat_op->get_output_value(0)->get_consumers().size() == 1, + status::invalid_graph, + "concat's successor op should only have one consumer, but got " + "%zu", + concat_op->get_output_value(0)->get_consumers().size()); op_t &scale_op = concat_op->get_output_value(0)->get_consumers()[0].get_op(); op_t &zp_op = scale_op.get_output_value(0)->get_consumers()[0].get_op(); @@ -845,7 +855,8 @@ status_t fuse_post_ops(std::shared_ptr &sg) { return status::success; }); - if (ret != status::success) return ret; + VCHECK_TRANSFORM(ret == status::success, ret, + "Error finding fusible post_op groups"); if (fuse_groups.empty()) { 
changed = false; @@ -992,14 +1003,12 @@ status_t fuse_post_ops(std::shared_ptr &sg) { bool changed = true; do { - auto ret = fuse_post_ops_func(changed); - if (ret != status::success) return ret; + CHECK(fuse_post_ops_func(changed)); cnt++; } while (changed && cnt <= max_num_limit); - assertm(cnt <= max_num_limit + 1, - "Failed to fuse all post ops since there has unsupported ones."); - if (cnt > max_num_limit + 1) return status::unimplemented; + VCHECK_TRANSFORM(cnt <= max_num_limit + 1, status::unimplemented, + "Failed to fuse all post ops since there has unsupported ones"); return status::success; } @@ -1019,8 +1028,9 @@ status_t fuse_src_zero_points(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &zp_op : zp_ops) { - assertm(zp_op->num_outputs() == 1, - "zp_op should have only one output value."); + VCHECK_TRANSFORM(zp_op->num_outputs() == 1, status::invalid_graph_op, + "zp_op should have only one output value, but got %zu", + zp_op->num_outputs()); auto out_val = zp_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); @@ -1071,7 +1081,7 @@ status_t fuse_src_zero_points(std::shared_ptr &sg) { auto zps = zp_op->get_attr>(op_attr::zps); not_all_zero = !utils::all_zero(zps); if (not_all_zero) { - assertm(zps.size() == 1, + VCHECK_TRANSFORM(zps.size() == 1, status::unimplemented, "zp attr only support scalar zp, need to use " "runtime arg to support vector zp"); fusion_info.set_zero_points( @@ -1105,8 +1115,9 @@ status_t fuse_src_scales(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &scale_op : scales_ops) { - assertm(scale_op->num_outputs() == 1, - "scale_op should have only one output value."); + VCHECK_TRANSFORM(scale_op->num_outputs() == 1, status::invalid_graph_op, + "scale_op should have only one output value, but got %zu", + scale_op->num_outputs()); auto out_val = scale_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); if (consumers.empty()) continue; @@ -1132,10 +1143,15 @@ status_t fuse_src_scales(std::shared_ptr &sg) { int ndims = scale_op->get_input_value(0) ->get_logical_tensor() .ndims; - if ((!trans_flag && axis != ndims - 1 && axis != -1) - || (trans_flag && axis != ndims - 2 && axis != -2)) { - return status::unimplemented; - } + VCHECK_TRANSFORM( + (!trans_flag && (axis == ndims - 1 || axis == -1)) + || (trans_flag + && (axis == ndims - 2 || axis == -2)), + status::unimplemented, + "Matmul only support applying per channel scale " + "along the last dimension for DNNL_ARG_WEIGHTS. 
" + "trans_flag: %d, axis: %lld, ndims: %d", + trans_flag, axis, ndims); } int64_t key = -1; if (next_op.has_attr(op_attr::fusion_info_key)) { @@ -1161,7 +1177,8 @@ status_t fuse_src_scales(std::shared_ptr &sg) { scale_op->shared_from_this(), true, offset); rewriter.to_remove(scale_op->shared_from_this()); } else { - assertm(false, "src scales must be runtime scales."); + VCHECK_TRANSFORM(false, status::unimplemented, + "src scales must be runtime scales."); } } } @@ -1315,8 +1332,9 @@ status_t fuse_dst_zero_points(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); for (auto &zp_op : zp_ops) { - assertm(zp_op->num_outputs() == 1, - "zp_op should have only one output value."); + VCHECK_TRANSFORM(zp_op->num_outputs() == 1, status::invalid_graph_op, + "zp_op should have only one output value, but got %zu", + zp_op->num_outputs()); auto out_val = zp_op->get_output_values()[0]; auto consumers = out_val->get_consumers(); @@ -2259,7 +2277,9 @@ status_t binary_canonicalization(std::shared_ptr &sg) { = binary_doable(ltw(src0_lt).vdims(), ltw(src1_lt).vdims()); } - if (!shape_check_ok) return status::invalid_shape; + VCHECK_TRANSFORM(shape_check_ok, status::invalid_shape, + "Binary op shape check failed for op: %s .", + cur_op->get_name().c_str()); // insert unsqueeze op int32_t src0_ndims = src0_lt.ndims; @@ -2570,7 +2590,8 @@ status_t fuse_adjacent_reorders(std::shared_ptr &sg) { return status::success; }); - if (ret != status::success) return ret; + VCHECK_TRANSFORM(ret == status::success, ret, + "Error finding adjacent reorders."); if (fuse_groups.empty()) { changed = false; @@ -2698,8 +2719,7 @@ status_t fuse_adjacent_reorders(std::shared_ptr &sg) { const auto &pd = reorder_executable_t::create_desc( fused_op, *p_engine, mgr, pd_cache); const memory::desc scratchpad_desc = pd.scratchpad_desc(); - auto status = fill_layout_info(scratchpad_val, scratchpad_desc); - if (status != status::success) return status; + CHECK(fill_layout_info(scratchpad_val, scratchpad_desc)); rewriter.to_insert(fused_op); rewriter.to_remove(op1->shared_from_this()); @@ -2714,13 +2734,12 @@ status_t fuse_adjacent_reorders(std::shared_ptr &sg) { bool changed = true; do { - auto ret = fuse_two_adjacent_reorders(changed); - if (ret != status::success) return ret; + CHECK(fuse_two_adjacent_reorders(changed)); cnt++; } while (changed && cnt <= max_num_limit); - assertm(cnt <= max_num_limit + 1, "reorder fusion failed."); - if (cnt > max_num_limit + 1) return status::unimplemented; + VCHECK_TRANSFORM(cnt <= max_num_limit + 1, status::unimplemented, + "Reorder fusion failed."); return status::success; } @@ -3108,11 +3127,11 @@ status_t reorder_canonicalization(std::shared_ptr &sg) { }; if (qtype == "per_channel") { - VCHECK_UNIMPLEMENTED( - (!(cur_op->has_attr(op_attr::with_runtime_src_zps) - || cur_op->has_attr( - op_attr::with_runtime_dst_zps))), - "reorder primitive does not support zero points for " + VCHECK_TRANSFORM((!(cur_op->has_attr(op_attr::with_runtime_src_zps) + || cur_op->has_attr( + op_attr::with_runtime_dst_zps))), + status::unimplemented, + "Reorder primitive does not support zero points for " "per-channel quantization"); } @@ -3243,15 +3262,13 @@ status_t common_reorder_elimination(std::shared_ptr &sg) { bool changed = true; do { - auto ret = cse_func(changed); - if (ret != status::success) return ret; + CHECK(cse_func(changed)); cnt++; } while (changed && cnt <= max_iter_num); - assertm(cnt <= max_iter_num + 1, + VCHECK_TRANSFORM(cnt <= max_iter_num + 1, status::unimplemented, "Failed to eliminate common 
reorders since the pass can't " "converge."); - if (cnt > max_iter_num + 1) return status::unimplemented; return status::success; } @@ -3348,22 +3365,31 @@ status_t combine_binary_post_op_scales(std::shared_ptr &sg) { continue; op_t &scales_in0_op = bin_in0_val->get_producer(); - assertm(scales_in0_op.get_kind() == op_kind::dnnl_mul_scales, - "the first predecessor of a binary op should be mul_scales."); + VCHECK_TRANSFORM(scales_in0_op.get_kind() == op_kind::dnnl_mul_scales, + status::invalid_graph, + "the first predecessor of a binary op should be mul_scales. " + "but got %s", + scales_in0_op.get_name().c_str()); if (scales_in0_op.has_attr(op_attr::with_runtime_scales) && scales_in0_op.get_attr(op_attr::with_runtime_scales)) continue; op_t &scales_in1_op = bin_in1_val->get_producer(); - assertm(scales_in1_op.get_kind() == op_kind::dnnl_mul_scales, - "the second predecessor of a binary op should be mul_scales."); + VCHECK_TRANSFORM(scales_in1_op.get_kind() == op_kind::dnnl_mul_scales, + status::invalid_graph, + "the second predecessor of a binary op should be mul_scales. " + "but got %s", + scales_in1_op.get_name().c_str()); if (scales_in1_op.has_attr(op_attr::with_runtime_scales) && scales_in1_op.get_attr(op_attr::with_runtime_scales)) continue; op_t &scales_out_op = bin_out_val->get_consumers()[0].get_op(); - assertm(scales_out_op.get_kind() == op_kind::dnnl_mul_scales, - "the successor of a binary op should be mul_scales."); + VCHECK_TRANSFORM(scales_out_op.get_kind() == op_kind::dnnl_mul_scales, + status::invalid_graph, + "the successor predecessor of a binary op should be " + "mul_scales. but got %s", + scales_out_op.get_name().c_str()); if (scales_out_op.has_attr(op_attr::with_runtime_scales) && scales_out_op.get_attr(op_attr::with_runtime_scales)) continue; @@ -3403,9 +3429,10 @@ status_t combine_binary_post_op_scales(std::shared_ptr &sg) { const auto multiplier = std::multiplies(); switch (bin_kind) { case dnnl::algorithm::binary_add: - assertm(std::all_of(in0_scales.begin(), in0_scales.end(), + VCHECK_TRANSFORM( + std::all_of(in0_scales.begin(), in0_scales.end(), [](float v) { return v != 0.f; }), - "scales can't be zero"); + status::invalid_arguments, "scales can't be zero"); new_scales_in0 = fuse_scales(in0_scales, inv_out_scales, multiplier); new_scales_in1 @@ -3425,7 +3452,8 @@ status_t combine_binary_post_op_scales(std::shared_ptr &sg) { {&scales_in0_op, &scales_in1_op, &scales_out_op}); break; default: - assertm(false, "unsupported binary post-op was provided."); + VCHECK_TRANSFORM(false, status::unimplemented, + "unsupported binary post-op was provided."); break; } diff --git a/src/graph/backend/dnnl/passes/utils.cpp b/src/graph/backend/dnnl/passes/utils.cpp index 88159700f96..b31bb7e4301 100644 --- a/src/graph/backend/dnnl/passes/utils.cpp +++ b/src/graph/backend/dnnl/passes/utils.cpp @@ -64,9 +64,8 @@ status_t set_given_inputs_outputs(std::shared_ptr &sg, // partition in/outs should not have default id. 
There must be some // errors in previous graph transformation stage - if (edge_id == std::numeric_limits::max()) - return status::invalid_graph; - + VCHECK_UTILS(edge_id != std::numeric_limits::max(), + status::invalid_graph, "Invalid edge_id %zu", edge_id); bool found = false; for (const auto &given : givens) { if (edge_id == given.id) { @@ -86,7 +85,9 @@ status_t set_given_inputs_outputs(std::shared_ptr &sg, } } } - if (!valid) return status::invalid_arguments; + VCHECK_UTILS(valid, status::invalid_arguments, + "Invalid given logical tensor for given.id %zu", + given.id); } edge->set_logical_tensor(given); @@ -95,17 +96,15 @@ status_t set_given_inputs_outputs(std::shared_ptr &sg, } } - if (!found) return status::invalid_arguments; + VCHECK_UTILS(found, status::invalid_arguments, + "Can't find given logical tensor for edge_id %zu", edge_id); } return status::success; }; - status_t ret; - ret = func(graph_in_vals, inputs, true, true); - if (ret != status::success) return ret; - - ret = func(graph_out_vals, outputs, true, false); - return ret; + CHECK(func(graph_in_vals, inputs, true, true)); + CHECK(func(graph_out_vals, outputs, true, false)); + return status::success; } status_t set_given_inputs_outputs(std::vector &subgraph, @@ -187,7 +186,8 @@ std::vector get_constant_block_output_values( return status::success; }; status_t status = topo_order_visit(sg->get_output_ops(), func); - if (status != status::success) return {}; + VCHECK_UTILS(status == status::success, {}, + "Failed to get constant block output values"); return ret; } @@ -220,8 +220,7 @@ status_t infer_shape(std::shared_ptr &sg) { } } - auto ret = sg->infer_shape(); - if (ret != status::success) return ret; + CHECK(sg->infer_shape()); // Fill the inferred shape and strides to subgraph's outputs for (size_t i = 0; i < sg->outs_.size(); i++) { @@ -238,7 +237,7 @@ status_t infer_shape(std::shared_ptr &sg) { op->remove_attr(op_attr::dw_type); } - return ret; + return status::success; } const std::map &get_binary_alg_map() { @@ -273,8 +272,11 @@ bool binary_doable( static bool post_binary_fusible_impl(const op_t *base_op, const std::vector &fused_shape, const std::vector &other_shape, engine_kind_t ekind) { - assertm(fused_shape.size() == other_shape.size(), - "must have same ndims, pls run binary_canonicalization pass first"); + VCHECK_UTILS(fused_shape.size() == other_shape.size(), false, + "binary fusible ops must have same ndims, " + "fused_shape size is %zu, other_shape size is %zu." 
+ "pls run binary_canonicalization pass first", + fused_shape.size(), other_shape.size()); // full tensor and per tensor broadcasted if (fused_shape == other_shape || std::all_of(other_shape.begin(), other_shape.end(), @@ -622,10 +624,10 @@ bool is_layout_reorder(const op_t *op) { } std::shared_ptr clone_mul_scales(const std::shared_ptr &scale_op) { - assertm(scale_op->num_inputs() <= 1, - "scale_op should have only one input value."); - assertm(!scale_op->has_attr(op_attr::with_runtime_scales), - "scale_op should be static"); + VCHECK_UTILS(scale_op->num_inputs() <= 1 + && !scale_op->has_attr(op_attr::with_runtime_scales), + nullptr, + "scale_op should be static and have only one input value."); auto new_op = std::make_shared(op_kind::dnnl_mul_scales); new_op->set_attr>(op_attr::scales, scale_op->get_attr>(op_attr::scales)); @@ -637,10 +639,9 @@ std::shared_ptr clone_mul_scales(const std::shared_ptr &scale_op) { } bool inverse_mul_scales(std::shared_ptr &scale_op) { - assertm(scale_op->num_inputs() <= 1, - "scale_op should have only one input value."); - assertm(!scale_op->has_attr(op_attr::with_runtime_scales), - "scale_op should be static"); + VCHECK_UTILS(scale_op->num_inputs() <= 1 + && !scale_op->has_attr(op_attr::with_runtime_scales), + false, "scale_op should be static and have only one input value."); auto scales = scale_op->get_attr>(op_attr::scales); scales = dnnl_impl::utils::fmap(scales, [](float s) { return 1.f / s; }); scale_op->set_attr(op_attr::scales, scales); diff --git a/src/graph/backend/dnnl/passes/utils.hpp b/src/graph/backend/dnnl/passes/utils.hpp index 6ab4536157d..aac58b1ee1b 100644 --- a/src/graph/backend/dnnl/passes/utils.hpp +++ b/src/graph/backend/dnnl/passes/utils.hpp @@ -41,6 +41,9 @@ #include "oneapi/dnnl/dnnl.hpp" +#define VCHECK_UTILS(cond, status, msg, ...) 
\ + VCONDCHECK(graph, create, check, utils, (cond), status, msg, ##__VA_ARGS__); + namespace dnnl { namespace impl { namespace graph { @@ -88,7 +91,8 @@ class pass_pipeline_t { status_t ret; for (size_t i = 0; i < passes_.size(); i++) { ret = passes_[i](sg); - if (ret != status::success) { return ret; } + VCHECK_UTILS(ret == status::success, ret, "run pass %s failed", + names_[i].c_str()); // Dump the subgraph to dot file if (enable_visualizer_) { @@ -97,8 +101,13 @@ class pass_pipeline_t { } // Validate the subgraph after each pass - if (enable_validator_) { ret = validator_.run(sg); } - if (ret != status::success) { return ret; } + if (enable_validator_) { + ret = validator_.run(sg); + VCHECK_UTILS(ret == status::success, ret, + "validation failed " + "after run pass %s", + names_[i].c_str()); + } } return status::success; } From 3ad5d52f05d2ed413c3f70faf734034f976a11a4 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 10:32:36 -0800 Subject: [PATCH 08/40] ngen: use mask for bitfield cast --- src/gpu/intel/jit/ngen/ngen_gen12.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/jit/ngen/ngen_gen12.hpp b/src/gpu/intel/jit/ngen/ngen_gen12.hpp index 0f08295ef8e..3ea47408d5b 100644 --- a/src/gpu/intel/jit/ngen/ngen_gen12.hpp +++ b/src/gpu/intel/jit/ngen/ngen_gen12.hpp @@ -726,7 +726,7 @@ static inline void encodeTernarySrc0(Instruction12 &i, S0 src0, Tag tag) auto vs0 = encodeTernaryVS01(src0); - i.ternary.src0VS0 = vs0; + i.ternary.src0VS0 = vs0 & 1; i.ternary.src0VS1 = vs0 >> 1; } @@ -745,7 +745,7 @@ static inline void encodeTernarySrc1(Instruction12 &i, S1 src1, Tag tag) auto vs1 = encodeTernaryVS01(src1); - i.ternary.src1VS0 = vs1; + i.ternary.src1VS0 = vs1 & 1; i.ternary.src1VS1 = vs1 >> 1; } From 681de1a0c9da969b226a230568d49bb73606e068 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 10:51:14 -0800 Subject: [PATCH 09/40] ngen: fix problematic type-casts --- src/gpu/intel/jit/ngen/ngen_gen12.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/ngen/ngen_gen12.hpp b/src/gpu/intel/jit/ngen/ngen_gen12.hpp index 3ea47408d5b..77ba78ae115 100644 --- a/src/gpu/intel/jit/ngen/ngen_gen12.hpp +++ b/src/gpu/intel/jit/ngen/ngen_gen12.hpp @@ -946,18 +946,18 @@ bool Instruction12::getOperandRegion(autoswsb::DependencyRegion ®ion, int opN case -1: if (send.dstRegFile == RegFileARF) return false; base = send.dstReg; - len = send.descIsReg ? -1 : send.desc20_24; + len = send.descIsReg ? -1 : (int)send.desc20_24; if (len == 31) len++; break; case 0: if (send.src0RegFile == RegFileARF) return false; base = send.src0Reg; - len = send.descIsReg ? -1 : (send.desc25_29 & 0xF); + len = send.descIsReg ? -1 : (int)(send.desc25_29 & 0xF); break; case 1: if (send.src1RegFile == RegFileARF) return false; base = send.src1Reg; - len = send.exDescIsReg ? -1 : send.exDesc6_10; + len = send.exDescIsReg ? 
-1 : (int)send.exDesc6_10; break; case 2: case 3: // TODO: May need to track indirect acc usage From 5d8be3c895ec33f15338608895f8138f0efb55cc Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:10:36 -0800 Subject: [PATCH 10/40] ngen: mark const guard variables as such --- src/gpu/intel/jit/ngen/ngen.hpp | 2 +- src/gpu/intel/jit/ngen/ngen_pseudo.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/jit/ngen/ngen.hpp b/src/gpu/intel/jit/ngen/ngen.hpp index 5f5ca1f324f..f14a4090e05 100644 --- a/src/gpu/intel/jit/ngen/ngen.hpp +++ b/src/gpu/intel/jit/ngen/ngen.hpp @@ -2299,7 +2299,7 @@ BinaryCodeGenerator::opSend(Opcode op, const InstructionModifier &mod, Share InstructionModifier emod = mod | defaultModifier; auto src0 = src0_; - bool src0Indirect = (hw >= HW::Xe3 && src0.isIndirect()); + const bool src0Indirect = (hw >= HW::Xe3 && src0.isIndirect()); if (src0Indirect) src0 = src0.getIndirectReg(); diff --git a/src/gpu/intel/jit/ngen/ngen_pseudo.hpp b/src/gpu/intel/jit/ngen/ngen_pseudo.hpp index 6cc19569f56..aebd287c198 100644 --- a/src/gpu/intel/jit/ngen/ngen_pseudo.hpp +++ b/src/gpu/intel/jit/ngen/ngen_pseudo.hpp @@ -548,7 +548,7 @@ void loadlid(int argBytes, int dims = 3, int simd = 8, const GRF &temp = GRF(127 const int grfOW = grfSize / 16; int simdGRFs = (simd > 16 && grfSize < 64) ? 2 : 1; int insns = 0; - bool lsc = (hardware >= HW::XeHPG); + const bool lsc = (hardware >= HW::XeHPG); auto tempAddr = temp[lsc ? 0 : 2]; if (dims > 0) { From 8b8ace5d0bed02cabb217911ef5a76679762822f Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:48:24 -0800 Subject: [PATCH 11/40] ngen: adjust VS for Vx indirect --- src/gpu/intel/jit/ngen/ngen_gen12.hpp | 4 ++-- src/gpu/intel/jit/ngen/ngen_gen8.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpu/intel/jit/ngen/ngen_gen12.hpp b/src/gpu/intel/jit/ngen/ngen_gen12.hpp index 77ba78ae115..5a1d582fcac 100644 --- a/src/gpu/intel/jit/ngen/ngen_gen12.hpp +++ b/src/gpu/intel/jit/ngen/ngen_gen12.hpp @@ -521,7 +521,7 @@ static inline constexpr14 BinaryOperand12 encodeBinaryOperand12(const RegData &r op.indirect.addrReg = rd.getIndirectOff(); op.indirect.addrMode = 1; if (srcN >= 0) - op.indirect.vs = (rd.isVxIndirect()) ? 0xFFFF : pow2Encode(rd.getVS()); + op.indirect.vs = (rd.isVxIndirect()) ? 0xF : pow2Encode(rd.getVS()); } else { op.direct.regFile = getRegFile(rd); op.direct.subRegNum = rd.getByteOffset(); @@ -553,7 +553,7 @@ static inline constexpr14 BinaryOperand12 encodeBinaryOperand12(const RegData &r op.indirect.addrReg = rd.getIndirectOff(); op.indirect.addrMode = 1; if (srcN >= 0) { - op.indirect.vs = (rd.isVxIndirect()) ? 0xFFFF : pow2Encode(rd.getVS()); + op.indirect.vs = (rd.isVxIndirect()) ? 0xF : pow2Encode(rd.getVS()); op.indirectXeHPC.addrOff0 = (rd.getOffset() & 1); } } else { diff --git a/src/gpu/intel/jit/ngen/ngen_gen8.hpp b/src/gpu/intel/jit/ngen/ngen_gen8.hpp index abf7f84cf1c..45af298eab2 100644 --- a/src/gpu/intel/jit/ngen/ngen_gen8.hpp +++ b/src/gpu/intel/jit/ngen/ngen_gen8.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
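(Two related cleanups are visible in the ngen changes above and in the gen8 hunk that follows. First, reads and writes of packed bitfields now go through an explicit mask or cast (vs0 & 1, (int)send.desc20_24), so the narrowing is visible rather than implicit. Second, the VxH-indirect sentinel written into the vs field changes from 0xFFFF to 0xF; judging by the 0xF sentinel and the small pow2-encoded values stored in the same field, vs is only a few bits wide, so both constants end up storing the same all-ones pattern and the narrower one simply avoids an out-of-range assignment. A minimal sketch of the truncation behaviour, with an assumed four-bit field width:

    #include <cstdio>

    struct Operand {
        unsigned vs : 4; // assumed width; the real encoding structs are in ngen_gen12.hpp
    };

    int main() {
        Operand a {}, b {};
        a.vs = 0xFFFF; // out-of-range value, silently reduced modulo 2^4
        b.vs = 0xF;    // same stored bits, no truncating assignment
        std::printf("%u %u\n", a.vs, b.vs); // prints "15 15"
        return 0;
    }
)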
@@ -318,7 +318,7 @@ static inline constexpr14 BinaryOperand8 encodeBinaryOperand8(const RegData &rd) result.indirect1.addrMode = 1; result.indirect1.addrSubreg = rd.getIndirectOff(); if (!dest) { - result.indirect1.vs = (rd.isVxIndirect()) ? 0xFFFF : + result.indirect1.vs = (rd.isVxIndirect()) ? 0xF : (rd.getVS() == 0) ? 0 : (1 + utils::log2(rd.getVS())); } From 3cc67a8a6241510d4b9d165cede90dc26dce8f2f Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 10:29:58 -0800 Subject: [PATCH 12/40] xe: jit: gemm: mark const guard variables as such --- src/gpu/intel/jit/gemm/generator/pieces/walk_orders.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/gemm/generator/pieces/walk_orders.cxx b/src/gpu/intel/jit/gemm/generator/pieces/walk_orders.cxx index d10b67b761a..e9751d02ff0 100644 --- a/src/gpu/intel/jit/gemm/generator/pieces/walk_orders.cxx +++ b/src/gpu/intel/jit/gemm/generator/pieces/walk_orders.cxx @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,8 +124,8 @@ void BLASKernelGenerator::gemmHilbertlikeOrder(const Subregister &groupIDMN, { if (aLeader.isValid() || bLeader.isValid()) stub(); - bool triangular = false; - bool rectangular = !triangular && state.inputs.hilbertVD.isValid(); + const bool triangular = false; + const bool rectangular = !triangular && state.inputs.hilbertVD.isValid(); auto storage = state.ra.alloc(); auto u = storage.ud(0); From ff9ad774959e7d0a90c465ca34125856624db615 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 10:44:50 -0800 Subject: [PATCH 13/40] xe: pooling: fix problematic type-cast --- src/gpu/intel/jit/pooling/ir_builder.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/gpu/intel/jit/pooling/ir_builder.cpp b/src/gpu/intel/jit/pooling/ir_builder.cpp index 82a247d0e9b..35143a9c366 100644 --- a/src/gpu/intel/jit/pooling/ir_builder.cpp +++ b/src/gpu/intel/jit/pooling/ir_builder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -236,9 +236,10 @@ stmt_t pooling_ir_builder_t::try_build(pooling_ir_builder_t &pb, schedule.bind(fuse[0], kg.idx(idx)); }; auto odhw_to_schedule = [&](expr_t s1, expr_t ns, expr_t s0) { - int s0_idx = (s0.is_empty()) ? -1 : src_view.vvar_index(s0); - int s1_idx = src_view.vvar_index(s1); - int ns_idx = src_view.vvar_index(ns); + dim_idx_t s0_idx + = (s0.is_empty()) ? 
dim_idx::invalid : src_view.vvar_index(s0); + dim_idx_t s1_idx = src_view.vvar_index(s1); + dim_idx_t ns_idx = src_view.vvar_index(ns); ir_assert((s0_idx <= 4) && (s1_idx <= 4) && (ns_idx <= 4)); // s1 and ns may swap sides, which affects their fusing order: it has @@ -266,7 +267,7 @@ stmt_t pooling_ir_builder_t::try_build(pooling_ir_builder_t &pb, schedule.bind(s1_tg, tg.idx(s1_idx - 2)); s1_fuse.emplace_back(s1_kg); - if (s0_idx >= 0) { + if (s0_idx != dim_idx::invalid) { ir_assert(s0_idx == s1_idx + 1); const dim_t s0_tlg_unroll = lg[s0_idx]; const dim_t s0_unroll = s0_tlg_unroll * tg[s0_idx - 2]; From bb414f2c0128ca0282f2096100d082e5dfeb44fa Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 11:23:56 -0800 Subject: [PATCH 14/40] xe: jit, ocl: fix dim index checks --- src/gpu/intel/jit/conv/normalization.cpp | 2 +- src/gpu/intel/jit/ir/epilogue.cpp | 2 +- src/gpu/intel/jit/ir/slm_reduce_builder.cpp | 4 ++-- src/gpu/intel/jit/ir/tensor.hpp | 12 ++++++------ src/gpu/intel/ocl/custom_reorder.cpp | 6 +++--- src/gpu/intel/ocl/reduction/atomic_reduction.cpp | 8 +++++--- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/gpu/intel/jit/conv/normalization.cpp b/src/gpu/intel/jit/conv/normalization.cpp index e1914e3eba3..de2ff13ce61 100644 --- a/src/gpu/intel/jit/conv/normalization.cpp +++ b/src/gpu/intel/jit/conv/normalization.cpp @@ -35,7 +35,7 @@ layout_t insert_dimension(const layout_t &layout, dim_idx_t dim_idx) { } layout_t remove_size_1_dimension(const layout_t &layout, dim_idx_t dim_idx) { - ir_assert(0 <= dim_idx && dim_idx < layout.ndims()); + ir_assert(dim_idx != dim_idx::invalid && dim_idx < layout.ndims()); ir_assert(layout.dim(dim_idx) == 1); dim_assignment_t a(layout.ndims(), layout.ndims() - 1); for (dim_idx_t i = 0; i < layout.ndims(); i++) { diff --git a/src/gpu/intel/jit/ir/epilogue.cpp b/src/gpu/intel/jit/ir/epilogue.cpp index 53dcfc295e4..dfcc47bf9cc 100644 --- a/src/gpu/intel/jit/ir/epilogue.cpp +++ b/src/gpu/intel/jit/ir/epilogue.cpp @@ -204,7 +204,7 @@ class post_op_tensor_t { const expr_t &compute_expr() const { return info_.compute_expr(); } bool is_broadcast_dim(dim_idx_t dim_idx) const { - ir_assert(dim_idx >= 0 && dim_idx < mem_view().nvdims()); + ir_assert(dim_idx != dim_idx::invalid && dim_idx < mem_view().nvdims()); return (mask() & (1 << dim_idx)) == 0; } diff --git a/src/gpu/intel/jit/ir/slm_reduce_builder.cpp b/src/gpu/intel/jit/ir/slm_reduce_builder.cpp index c1eaccb4ee0..127513a8c1d 100644 --- a/src/gpu/intel/jit/ir/slm_reduce_builder.cpp +++ b/src/gpu/intel/jit/ir/slm_reduce_builder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
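(The assertion rewrites in this patch all stem from the same property: dim_idx_t is an unsigned type, so a condition like 0 <= dim_idx or dim_idx >= 0 is always true and only earns a compiler warning, while a missing index is represented by the dim_idx::invalid sentinel, the maximum value of the type. A small sketch of the before/after shape, assuming dim_idx_t is a 32-bit unsigned alias:

    #include <cstdint>
    #include <limits>

    using dim_idx_t = uint32_t; // assumption for this sketch
    namespace dim_idx {
    constexpr dim_idx_t invalid = std::numeric_limits<dim_idx_t>::max();
    } // namespace dim_idx

    // Old style: the first comparison can never fail for an unsigned type.
    bool old_check(dim_idx_t idx, dim_idx_t ndims) {
        return idx >= 0 && idx < ndims;
    }

    // New style: the "not found" sentinel is rejected explicitly.
    bool new_check(dim_idx_t idx, dim_idx_t ndims) {
        return idx != dim_idx::invalid && idx < ndims;
    }

Because the sentinel is the maximum value, idx < ndims already rejects it in practice; the rewrite documents the intent and silences type-limits warnings rather than changing behaviour.)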
@@ -38,7 +38,7 @@ slm_reduce_builder_t::slm_reduce_builder_t(ir_context_t &ir_ctx, , reg_layout_(reg_layout) , thr_tile_(thr_tile) , dim_(dim) { - ir_assert((dim_ >= 0) && (dim_ <= 2)); + ir_assert((dim_ != dim_idx::invalid) && (dim_ <= 2)); ir_assert(tg_grid_.dim(dim_) > 1); tmp_reg_buf_ = ir_ctx.create_tmp_var(type_t::byte_ptr()); diff --git a/src/gpu/intel/jit/ir/tensor.hpp b/src/gpu/intel/jit/ir/tensor.hpp index d536b9a7709..cd4352e23fd 100644 --- a/src/gpu/intel/jit/ir/tensor.hpp +++ b/src/gpu/intel/jit/ir/tensor.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -290,7 +290,7 @@ class grid_splitter_t { grid_splitter_t(const grid_info_t &grid) : grid_(grid), cur_idx_(grid.ndims() - 1), cur_stride_(1) { skip_size_1_dims(); - ir_assert(cur_idx_ >= 0); + ir_assert(cur_idx_ != dim_idx::invalid); } dim_t cur_block() const { @@ -1528,7 +1528,7 @@ class view_t { } bool has_tmask(dim_idx_t tidx) const { - ir_assert(tidx >= 0 && tidx < ntdims()); + ir_assert(tidx != dim_idx::invalid && tidx < ntdims()); return !tdims_[tidx].mask().is_empty(); } @@ -1581,7 +1581,7 @@ class view_t { } bool is_masked_vdim(dim_idx_t vidx) const { - ir_assert(vidx >= 0 && vidx < nvdims()); + ir_assert(vidx != dim_idx::invalid && vidx < nvdims()); ir_assert(has_zero_vstart()) << "Can't be reliably determined if the view is a sub-view."; for (dim_idx_t i = 0; i < ntdims(); i++) { @@ -1900,8 +1900,8 @@ class dim_assignment_t { , assignments_(old_ndims, -1) {} void assign(dim_idx_t old_idx, dim_idx_t new_idx) { - ir_assert(0 <= old_idx && old_idx < old_ndims_); - ir_assert(0 <= new_idx && new_idx < new_ndims_); + ir_assert(old_idx != dim_idx::invalid && old_idx < old_ndims_); + ir_assert(new_idx != dim_idx::invalid && new_idx < new_ndims_); assignments_[old_idx] = new_idx; } diff --git a/src/gpu/intel/ocl/custom_reorder.cpp b/src/gpu/intel/ocl/custom_reorder.cpp index d9767647e3e..9a9c0315b3c 100644 --- a/src/gpu/intel/ocl/custom_reorder.cpp +++ b/src/gpu/intel/ocl/custom_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
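(The loop-bound change in the next hunk is the counting-down counterpart of the same cleanup: with an unsigned dim_idx_t, dim >= 0 can never terminate the loop, so the rewritten condition keeps only dim < MAX_NDIMS and relies on the decrement wrapping around to a huge value once it passes zero. A sketch, again assuming a 32-bit unsigned index type:

    #include <cstdint>
    #include <cstdio>

    using dim_idx_t = uint32_t; // assumption for this sketch
    constexpr dim_idx_t MAX_NDIMS = 6;

    int main() {
        dim_idx_t last = 3;
        // Visits 2, 1, 0; the next "dim--" wraps to 0xFFFFFFFF, which fails
        // the "dim < MAX_NDIMS" test and ends the loop.
        for (dim_idx_t dim = last - 1; dim < MAX_NDIMS; dim--)
            std::printf("dim %u\n", dim);
        return 0;
    }
)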
@@ -619,8 +619,8 @@ status_t custom_reorder_t::pd_t::init_conf(impl::engine_t *engine) { if (!may_use_sg8 && vect_size == 8) { return status_t::dnnl_unimplemented; } - for (dim_idx_t dim = last - 1; - dim >= 0 && dim < MAX_NDIMS && temp_block == 1; dim--) { + for (dim_idx_t dim = last - 1; dim < MAX_NDIMS && temp_block == 1; + dim--) { if (padded_dims[dim] % 4 == 0) { temp_block = 4; } if (padded_dims[dim] % 8 == 0) { temp_block = 8; } if (padded_dims[dim] % 16 == 0) { temp_block = 16; } diff --git a/src/gpu/intel/ocl/reduction/atomic_reduction.cpp b/src/gpu/intel/ocl/reduction/atomic_reduction.cpp index 982a88b084f..cc5330c59d4 100644 --- a/src/gpu/intel/ocl/reduction/atomic_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/atomic_reduction.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,7 +237,8 @@ status_t atomic_reduction_conf_t::init_dispatcher( src.append_block(all_dims[dim_idx], sizes[dim_idx]); } // the loop dim may have padding - update the outer block's stride to avoid it - size_t src_outer_idx = src.get_dim_idx(reduction_dims::outer); + dim_idx_t src_outer_idx = src.get_dim_idx(reduction_dims::outer); + gpu_assert(src_outer_idx != dim_idx::invalid); src.format_desc.blocking.strides[src_outer_idx] = outer_block.stride / conf.vect_size; @@ -251,7 +252,8 @@ status_t atomic_reduction_conf_t::init_dispatcher( src.remove_dim(reduction_dims::local, false); // Once again, loop dim padding causes issues - size_t dst_outer_idx = dst.get_dim_idx(reduction_dims::outer); + dim_idx_t dst_outer_idx = dst.get_dim_idx(reduction_dims::outer); + gpu_assert(dst_outer_idx != dim_idx::invalid); dst.format_desc.blocking.strides[dst_outer_idx] = inner_block.block / conf.vect_size; From 21b90d3a65a7ecd074f1a2adf4927f37cc405476 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 11:32:06 -0800 Subject: [PATCH 15/40] xe: jit: fix type-widening comparisons --- src/gpu/intel/jit/ir/blocking.cpp | 10 +++++----- src/gpu/intel/jit/pooling/config.hpp | 2 +- src/gpu/intel/jit/v2/ir/tensor.cpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gpu/intel/jit/ir/blocking.cpp b/src/gpu/intel/jit/ir/blocking.cpp index 7df9a413198..3e91529e0e6 100644 --- a/src/gpu/intel/jit/ir/blocking.cpp +++ b/src/gpu/intel/jit/ir/blocking.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
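(The blocking.cpp hunk that follows widens the factor-search variables from int to dim_t. Since dim_t is a 64-bit type, keeping the square-root bound, the loop counters, and the intermediate values in int invites narrowing conversions and mixed-width comparisons the compiler warns about, and it can truncate for very large sizes. A minimal sketch of the widened pattern, a simplified stand-in for tile_info_t::get_factors() rather than the actual implementation:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    using dim_t = int64_t;

    // Trial-division factor search kept entirely in dim_t, mirroring the
    // widened loop in the hunk below.
    void print_factor_pairs(dim_t n) {
        dim_t n_sqrt = static_cast<dim_t>(std::sqrt(static_cast<double>(n)));
        for (dim_t i = 1; i <= n_sqrt; i++)
            if (n % i == 0)
                std::printf("%lld x %lld\n", (long long)i, (long long)(n / i));
    }

    int main() {
        print_factor_pairs(4000012); // arbitrary example size
        return 0;
    }
)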
@@ -78,13 +78,13 @@ std::vector tile_info_t::loop_blocks(dim_t size, int iter_blk) const { std::vector tile_info_t::get_factors(dim_t n) { std::vector ret; - int n_sqrt = std::sqrt(n); - for (int i = 1; i <= n_sqrt; i++) { + dim_t n_sqrt = std::sqrt(n); + for (dim_t i = 1; i <= n_sqrt; i++) { if (n % i == 0) ret.push_back(i); } - int lo = n_sqrt; + dim_t lo = n_sqrt; if (n_sqrt * n_sqrt == n) lo--; - for (int i = lo; i >= 1; i--) { + for (dim_t i = lo; i >= 1; i--) { if (n % i == 0) ret.push_back(n / i); } return ret; diff --git a/src/gpu/intel/jit/pooling/config.hpp b/src/gpu/intel/jit/pooling/config.hpp index e247ca1b7dd..15728877195 100644 --- a/src/gpu/intel/jit/pooling/config.hpp +++ b/src/gpu/intel/jit/pooling/config.hpp @@ -348,7 +348,7 @@ class pooling_config_t : public prim_config_t { if (total_simds <= thr_count) return dim_t(1); const dim_t orig = num; num = 0; - for (int div = sqrtf(orig); div >= 1; div--) + for (dim_t div = sqrtf(orig); div >= 1; div--) if (orig % div == 0) { if (total_simds >= thr_count * (orig / div)) num = std::max(num, orig / div); diff --git a/src/gpu/intel/jit/v2/ir/tensor.cpp b/src/gpu/intel/jit/v2/ir/tensor.cpp index 74527b31c5c..5501c1dba9e 100644 --- a/src/gpu/intel/jit/v2/ir/tensor.cpp +++ b/src/gpu/intel/jit/v2/ir/tensor.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1370,7 +1370,7 @@ view_t view_t::scatterize(int stride_bytes, const prover_t &prover) const { layout_t split_layout(const layout_t &layout, dim_t inner_elems, dim_t outer_elems, std::vector &inner_block_idxs, std::vector &outer_block_idxs) { - int cur_elems = 1; + dim_t cur_elems = 1; auto in_inner = [&]() { return cur_elems < inner_elems; }; auto in_outer = [&]() { return cur_elems >= inner_elems @@ -1394,7 +1394,7 @@ layout_t split_layout(const layout_t &layout, dim_t inner_elems, } else if (in_outer()) { outer_block_idxs.push_back(i); if (cur_elems * b_size > inner_elems * outer_elems) { - int b_inner = ir_utils::safe_div( + dim_t b_inner = ir_utils::safe_div( cur_elems, inner_elems * outer_elems); int b_outer = ir_utils::safe_div(b_size, b_inner); auto new_layout = layout.split_block(&b, b_inner, b_outer); From 2093c49122d7b60a0807b1c04ea13dd26fd40936 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:07:44 -0800 Subject: [PATCH 16/40] xe: codegen: fix int range check --- src/gpu/intel/jit/codegen/codegen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/intel/jit/codegen/codegen.cpp b/src/gpu/intel/jit/codegen/codegen.cpp index 1fefff7f84e..fc9c15bf376 100644 --- a/src/gpu/intel/jit/codegen/codegen.cpp +++ b/src/gpu/intel/jit/codegen/codegen.cpp @@ -1448,7 +1448,7 @@ class expr_evaluator_t : public ir_visitor_t { std::vector vec(vec_size); for (int i = 0; i < vec_size; i++) { if (!is_const(obj.vec[i])) return false; - int value = to_cpp(obj.vec[i]); + int64_t value = to_cpp(obj.vec[i]); if (value < int_min || value > int_max) return false; vec[i] = (int)value; } From e873edc029934492698718e2b5b631bd72a02f58 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:08:10 -0800 Subject: [PATCH 17/40] xe: conv: check kernel creation status --- src/gpu/intel/jit/conv/gen_convolution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/gpu/intel/jit/conv/gen_convolution.cpp b/src/gpu/intel/jit/conv/gen_convolution.cpp index 2a71b661da7..1a3b15b569e 100644 --- a/src/gpu/intel/jit/conv/gen_convolution.cpp +++ b/src/gpu/intel/jit/conv/gen_convolution.cpp @@ -243,7 +243,7 @@ class gen_convolution_t { } if (!ok) return status::runtime_error; ir_assert(kernels_.size() == data.kernel_infos.size()); - primitive->register_kernels(kernels_); + CHECK(primitive->register_kernels(kernels_)); conv_tiler_t::after_create_hook(cfg, primitive); return status::success; From b6ee3475472f23f5f9c78cb514f54fbbcff196aa Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:09:49 -0800 Subject: [PATCH 18/40] xe: jit: avoid temporaries of overly wide type --- .../intel/jit/gemm/generator/pieces/matrix_access.cxx | 4 ++-- src/gpu/intel/jit/ir/fma.hpp | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/gpu/intel/jit/gemm/generator/pieces/matrix_access.cxx b/src/gpu/intel/jit/gemm/generator/pieces/matrix_access.cxx index 8eb340d6ffb..52d8f055922 100644 --- a/src/gpu/intel/jit/gemm/generator/pieces/matrix_access.cxx +++ b/src/gpu/intel/jit/gemm/generator/pieces/matrix_access.cxx @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -519,7 +519,7 @@ void BLASKernelGenerator::loadLoadStoreDescriptors(bool load, bool store, Re descLoad.parts.responseLen = 0; int underlyingSIMD = std::max(block.simdSize, maxScatteredSIMD(hw, astrategy) >> 1); - int log2GRFs = ilog2(underlyingSIMD * block.ebytes) - GRF::log2Bytes(hw); + int log2GRFs = ilog2(underlyingSIMD * (int)block.ebytes) - GRF::log2Bytes(hw); int log2Components = int(block.splitComplex); if (channel) mov(1, t2, 0x1000 << log2Components); diff --git a/src/gpu/intel/jit/ir/fma.hpp b/src/gpu/intel/jit/ir/fma.hpp index 8ba8552c042..70f6c6e2464 100644 --- a/src/gpu/intel/jit/ir/fma.hpp +++ b/src/gpu/intel/jit/ir/fma.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -157,9 +157,13 @@ class dpas_t : public func_impl_t { return call({dst, src0, src1, src2}); } - int dst_size() const { return exec_size * rcount * sizeof(uint32_t); } + int dst_size() const { + return exec_size * (int)(rcount * sizeof(uint32_t)); + } int src0_size() const { return dst_size(); } - int src1_size() const { return exec_size * sdepth * sizeof(uint32_t); } + int src1_size() const { + return exec_size * (int)(sdepth * sizeof(uint32_t)); + } int src2_size() const { const int dpas_size = sdepth * rcount * sizeof(uint32_t); return is_dpasw ? 
dpas_size / 2 : dpas_size; From 7a67739a0e1f008cb7e43b77b678c5964561e18f Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Thu, 9 Jan 2025 12:11:34 -0800 Subject: [PATCH 19/40] xe: jit: v2: add assert to fallback path --- src/gpu/intel/jit/v2/ir/tensor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpu/intel/jit/v2/ir/tensor.cpp b/src/gpu/intel/jit/v2/ir/tensor.cpp index 5501c1dba9e..7d3dc0adf4e 100644 --- a/src/gpu/intel/jit/v2/ir/tensor.cpp +++ b/src/gpu/intel/jit/v2/ir/tensor.cpp @@ -837,6 +837,7 @@ int layout_t::to_linear_index( if (i_coord == coord) return i; advance(idx, blocks_, tile_blocks); } + ir_error_not_expected(); return -1; } From 9617b2f45a68e8526afd962ace7f534c874211a1 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Thu, 26 Dec 2024 14:39:40 -0800 Subject: [PATCH 20/40] xe: sdpa: Improve scale and zp alignment --- src/gpu/intel/ocl/micro_sdpa.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp index 87cfaadbc9f..6a6aa30cf1b 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cpp +++ b/src/gpu/intel/ocl/micro_sdpa.cpp @@ -238,17 +238,20 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) { micro::GEMMProtocol::Options opts_kq; opts_kq.localB = true; opts_kq.slmPtr = true; + if (with_key_scales() && !kq_common_scales) { auto scale_dt = key_scales_dt(); problem_kq.Ta_scale = jit::convert_dnnl_to_kernel_type(scale_dt); - problem_kq.A_scale.alignment = uint8_t(types::data_type_size(scale_dt)); + problem_kq.A_scale.setAlignment( + int8_t(d->keys() * types::data_type_size(scale_dt))); problem_kq.A_scale.layout = MatrixLayout::N; problem_kq.aScale2D = true; } if (with_key_zp()) { auto zp_dt = key_zp_dt(); problem_kq.Tao = jit::convert_dnnl_to_kernel_type(zp_dt); - problem_kq.AO.alignment = uint8_t(types::data_type_size(zp_dt)); + problem_kq.AO.setAlignment( + int8_t(d->keys() * types::data_type_size(zp_dt))); problem_kq.AO.layout = MatrixLayout::N; problem_kq.aoPtrDims = kq_common_zp ? 0 : 2; problem_kq.aOffset = ABOffset::Calc; @@ -312,7 +315,8 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) { if (with_value_scales() && !vs_common_scales) { auto scale_dt = value_scales_dt(); problem_vs.Ta_scale = jit::convert_dnnl_to_kernel_type(scale_dt); - problem_vs.A_scale.alignment = uint8_t(types::data_type_size(scale_dt)); + problem_vs.A_scale.setAlignment(uint8_t(d->head_size() + / value_group_size() * types::data_type_size(scale_dt))); problem_vs.A_scale.layout = MatrixLayout::N; problem_vs.aScale2D = true; } @@ -320,6 +324,8 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) { auto zp_dt = value_zp_dt(); problem_vs.Tao = jit::convert_dnnl_to_kernel_type(zp_dt); problem_vs.AO.alignment = uint8_t(types::data_type_size(zp_dt)); + problem_vs.AO.setAlignment(uint8_t(d->head_size() / value_group_size() + * types::data_type_size(zp_dt))); problem_vs.AO.layout = MatrixLayout::N; problem_vs.aoPtrDims = vs_common_zp ? 
0 : 2; problem_vs.aOffset = ABOffset::Calc; From d42faa92eae2f9cebe5b96ca8bbe7659816857d5 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Fri, 27 Dec 2024 12:46:36 -0800 Subject: [PATCH 21/40] xe: sdpa: Prefetch scales and zero_points --- src/gpu/intel/ocl/micro_sdpa.cl | 38 ++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cl b/src/gpu/intel/ocl/micro_sdpa.cl index fdf5f4ccb21..fbd44542eda 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cl +++ b/src/gpu/intel/ocl/micro_sdpa.cl @@ -281,6 +281,17 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, /* Prefetch first K tile. */ cooperative_prefetch_2d_k(K, k, d, ugemm_kq_wg_tile_m, PREFETCH_D_MAX, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + +#if KEY_SCALES == QUANTIZE_2D + cooperative_prefetch_2d(K_scales, ugemm_kq_wg_tile_m, + PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, sg_ij, sg_per_wg, + SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif +#if KEY_ZERO_POINTS == QUANTIZE_2D + cooperative_prefetch_2d(K_zp, ugemm_kq_wg_tile_m, + PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, sg_ij, sg_per_wg, + SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif #endif /* Initialize S column sums in SLM to -inf */ @@ -414,8 +425,22 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, cooperative_prefetch_2d_maybe_rem(V, d, k - k0, D_MAX, (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldv, sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); -#endif +#if VAL_SCALES == QUANTIZE_2D + /* Prefetch V scales. */ + cooperative_prefetch_2d_maybe_rem(V_scales, d / VAL_GROUP_SIZE, k - k0, + d / VAL_GROUP_SIZE, + (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldvq, sg_ij, + sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif +#if VAL_ZERO_POINTS == QUANTIZE_2D + /* Prefetch V zero points. */ + cooperative_prefetch_2d_maybe_rem(V_zp, d / VAL_GROUP_SIZE, k - k0, + d / VAL_GROUP_SIZE, + (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldvq, sg_ij, + sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif +#endif #ifndef ALT_MAX /* Read back WG-wide maxima */ intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); @@ -501,6 +526,17 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, k - k0 - ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_m, PREFETCH_D_MAX, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#if KEY_SCALES == QUANTIZE_2D + cooperative_prefetch_2d( + K_scales + ((k0 + ugemm_kq_wg_tile_m) * ldkq), + ugemm_kq_wg_tile_m, PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, + sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif +#if KEY_ZERO_POINTS == QUANTIZE_2D + cooperative_prefetch_2d(K_zp + ((k0 + ugemm_kq_wg_tile_m) * ldkq), + ugemm_kq_wg_tile_m, PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, + sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif } #endif #if WITH_ATTN_MASK && defined(PREFETCH_MASK) From 24966a892658603932c8225bbe1130a24c2bb46f Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Fri, 3 Jan 2025 00:30:40 -0800 Subject: [PATCH 22/40] xe: sdpa: Update prefetch functions to improve performance --- src/gpu/intel/ocl/micro_sdpa.cl | 180 +++++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 48 deletions(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cl b/src/gpu/intel/ocl/micro_sdpa.cl index fbd44542eda..05afc6dff88 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cl +++ b/src/gpu/intel/ocl/micro_sdpa.cl @@ -249,6 +249,48 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, / VAL_ZP_ELEMENTS_PER_BYTE; #endif +#ifdef PREFETCH_K0 + /* Prefetch first K tile. 
*/ + cooperative_prefetch_2d_k( + /* ptr */ K, + /* r */ k, + /* c */ d, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ PREFETCH_D_MAX, + /* ld */ ldk, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); + +#if KEY_SCALES == QUANTIZE_2D + cooperative_prefetch_2d_maybe_rem( + /* ptr */ K_scales, + /* r */ k, + /* c */ d / KEY_GROUP_SIZE, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ PREFETCH_D_MAX / KEY_GROUP_SIZE, + /* ld */ ldkq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); +#endif +#if KEY_ZERO_POINTS == QUANTIZE_2D + cooperative_prefetch_2d_maybe_rem( + /* ptr */ K_zp, + /* r */ k, + /* c */ d / KEY_GROUP_SIZE, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ PREFETCH_D_MAX / KEY_GROUP_SIZE, + /* ld */ ldkq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); +#endif +#endif + /* Load Q tile, destined for SLM */ q_tile_type Q_tile; uint q0_copy = q_tile_sg_n * sg_ij; @@ -277,23 +319,6 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #endif scale *= 1.442695f; // log2(e) -#ifdef PREFETCH_K0 - /* Prefetch first K tile. */ - cooperative_prefetch_2d_k(K, k, d, ugemm_kq_wg_tile_m, PREFETCH_D_MAX, ldk, - sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); - -#if KEY_SCALES == QUANTIZE_2D - cooperative_prefetch_2d(K_scales, ugemm_kq_wg_tile_m, - PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, sg_ij, sg_per_wg, - SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); -#endif -#if KEY_ZERO_POINTS == QUANTIZE_2D - cooperative_prefetch_2d(K_zp, ugemm_kq_wg_tile_m, - PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, sg_ij, sg_per_wg, - SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); -#endif -#endif - /* Initialize S column sums in SLM to -inf */ const uint n_col_sg = DIV_UP(ugemm_kq_wg_tile_n, SUBGROUP_SIZE * sg_per_wg); const float neg_inf = -INFINITY; @@ -420,25 +445,48 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); #ifdef PREFETCH_V /* Prefetch V tile. */ - cooperative_prefetch_2d_maybe_rem(V, d, k - k0, D_MAX, - (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldv, sg_ij, - sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ V, + /* r */ d, + /* c */ k - k0, + /* rmax */ PREFETCH_D_MAX, + /* cmax */ ugemm_kq_wg_tile_m, + /* ld */ ldv, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #if VAL_SCALES == QUANTIZE_2D /* Prefetch V scales. */ - cooperative_prefetch_2d_maybe_rem(V_scales, d / VAL_GROUP_SIZE, k - k0, - d / VAL_GROUP_SIZE, - (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldvq, sg_ij, - sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ V_scales, + /* r */ (d / VAL_GROUP_SIZE), + /* c */ k - k0, + /* rmax */ PREFETCH_D_MAX / VAL_GROUP_SIZE, + /* cmax */ k_chunk, + /* ld */ ldvq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #endif #if VAL_ZERO_POINTS == QUANTIZE_2D /* Prefetch V zero points. 
*/ - cooperative_prefetch_2d_maybe_rem(V_zp, d / VAL_GROUP_SIZE, k - k0, - d / VAL_GROUP_SIZE, - (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldvq, sg_ij, - sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ V_zp, + /* r */ (d / VAL_GROUP_SIZE), + /* c */ k - k0, + /* rmax */ PREFETCH_D_MAX / VAL_GROUP_SIZE, + /* cmax */ k_chunk, + /* ld */ ldvq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #endif #endif #ifndef ALT_MAX @@ -522,35 +570,73 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #else const uint stride_k = 1; #endif - cooperative_prefetch_2d_k(K + (k0 + ugemm_kq_wg_tile_m) * stride_k, - k - k0 - ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_m, - PREFETCH_D_MAX, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, - LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_k( + /* ptr */ K + (k0 + ugemm_kq_wg_tile_m) * stride_k, + /* r */ k - k0 - ugemm_kq_wg_tile_m, + /* c */ d, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ ugemm_kq_wg_tile_n, + /* ld*/ ldk, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache*/ LSC_LDCC_L1C_L3C); #if KEY_SCALES == QUANTIZE_2D - cooperative_prefetch_2d( - K_scales + ((k0 + ugemm_kq_wg_tile_m) * ldkq), - ugemm_kq_wg_tile_m, PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, - sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ K_scales + (k0 + ugemm_kq_wg_tile_m), + /* r */ k - k0 - ugemm_kq_wg_tile_m, + /* c */ d / KEY_GROUP_SIZE, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ ugemm_kq_wg_tile_n / KEY_GROUP_SIZE, + /* ld */ ldkq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #endif #if KEY_ZERO_POINTS == QUANTIZE_2D - cooperative_prefetch_2d(K_zp + ((k0 + ugemm_kq_wg_tile_m) * ldkq), - ugemm_kq_wg_tile_m, PREFETCH_D_MAX / KEY_GROUP_SIZE, ldkq, - sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ K_zp + (k0 + ugemm_kq_wg_tile_m), + /* r */ k - k0 - ugemm_kq_wg_tile_m, + /* c */ d / KEY_GROUP_SIZE, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ ugemm_kq_wg_tile_n / KEY_GROUP_SIZE, + /* ld */ ldkq, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #endif } #endif + #if WITH_ATTN_MASK && defined(PREFETCH_MASK) /* Prefetch next mask tile. 
*/ if (!last) { #if BROADCAST_MASK_Q - cooperative_prefetch_2d(msk + k0 + ugemm_kq_wg_tile_m + sg_i0_kq, - ugemm_kq_sg_tile_m, 1, 0, 0, 1, SUBGROUP_SIZE, - LSC_LDCC_L1UC_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ msk + k0 + ugemm_kq_wg_tile_m, + /* r */ k - k0, + /* c */ 1, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ 1, + /* ld */ 0, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1C_L3C); #else - cooperative_prefetch_2d(msk + k0 + ugemm_kq_wg_tile_m + sg_i0_kq - + (sg_j0_kq + wg_j0) * q, - ugemm_kq_sg_tile_m, ugemm_kq_sg_tile_n, 0, 0, 1, - SUBGROUP_SIZE, LSC_LDCC_L1UC_L3C); + cooperative_prefetch_2d_maybe_rem( + /* ptr */ msk + k0 + ugemm_kq_sg_tile_m + (wg_j0)*MSK_S2, + /* r */ k - k0 - ugemm_kq_wg_tile_m, + /* c */ q - wg_j0, + /* rmax */ ugemm_kq_wg_tile_m, + /* cmax */ (ugemm_kq_wg_tile_n * PREFETCH_D_MAX) / D_MAX, + /* ld */ MSK_S2, + /* sg_id */ sg_ij, + /* n_sg */ sg_per_wg, + /* sg_size */ SUBGROUP_SIZE, + /* cache */ LSC_LDCC_L1UC_L3C); #endif } #endif @@ -563,8 +649,6 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); /* Accumulate A += V * S */ - int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); - a_tile_type A_tile1 = ugemm_vs( V, ldv, S_slm, ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_n, k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, (local char *)ugemm_slm From 2e410aae10a5600b4687880302b57919b20be27e Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Fri, 3 Jan 2025 00:27:55 -0800 Subject: [PATCH 23/40] xe: ukernel: fix cooperative prefetch function to avoid overlap --- src/gpu/intel/ocl/tile_ops.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/ocl/tile_ops.h b/src/gpu/intel/ocl/tile_ops.h index 5a859ee9236..527087e4259 100644 --- a/src/gpu/intel/ocl/tile_ops.h +++ b/src/gpu/intel/ocl/tile_ops.h @@ -629,11 +629,13 @@ __attribute__((overloadable)) void cooperative_prefetch_2d_internal( uint n_sg, uint sg_size, enum LSC_LDCC caching) { const uint cl_per_col = (rbytes + 63) >> 6; const uint cl = cl_per_col * c; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; #pragma unroll for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { - uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint i_cl = (ii_cl * cl_per_sg + sg_id) * sg_size + + get_sub_group_local_id(); uint r_cl = i_cl % cl_per_col; uint c_cl = i_cl / cl_per_col; if (i_cl < cl) { @@ -655,7 +657,8 @@ __attribute__((overloadable)) void cooperative_prefetch_2d_internal( const uint max_off = rbytes - 1 + (c - 1) * ld_bytes; #pragma unroll for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { - uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint i_cl = (ii_cl * cl_per_sg + sg_id) * sg_size + + get_sub_group_local_id(); uint r_cl = i_cl % cl_per_col; uint c_cl = i_cl / cl_per_col; uint pf_off = min(r_cl * 64 + c_cl * ld_bytes, max_off); From 5890fc624e0d45ac585b35fd1e1d460456319f3a Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sat, 4 Jan 2025 13:36:38 -0800 Subject: [PATCH 24/40] xe: sdpa: revert K0 prefetch to after Q load to slm --- src/gpu/intel/ocl/micro_sdpa.cl | 87 +++++++++++++++++---------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cl b/src/gpu/intel/ocl/micro_sdpa.cl index 05afc6dff88..931d1190d78 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cl +++ b/src/gpu/intel/ocl/micro_sdpa.cl @@ -216,12 +216,12 @@ micro_sdpa(const 
global KEY_DATA_T *K, const global half *Q, const bool need_sum_barrier = (ugemm_vs_barrier_count == 0); /* Locate K/Q/V/A matrices within batch */ - K += KEY_OFF(b1, b0_kv, 0, 0) / KEY_ELEMENTS_PER_BYTE; Q += QRY_OFF(b1, b0, 0, 0); V += VAL_OFF(b1, b0_kv, 0, 0) / VAL_ELEMENTS_PER_BYTE; A += DST_OFF(b1, b0, 0, 0, 0); #if WITH_ATTN_MASK + uint ldmsk = MSK_S2; msk += MSK_OFF(b1 % MSK_D0, b0 % MSK_D1, 0, 0); #ifndef BLOCK_MSK int mask_aligned = (((size_t)msk) % 4) == 0; @@ -230,6 +230,7 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #if KEY_SCALES K_scales += KEY_OFF(b1, b0_kv, 0, 0) / KEY_GROUP_SIZE; + uint num_key_groups = d / KEY_GROUP_SIZE; #endif #if KEY_SCALES == QUANTIZE_COMMON float k_scale = convert_float(*K_scales); @@ -240,6 +241,7 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #endif #if VAL_SCALES V_scales += VAL_OFF(b1, b0_kv, 0, 0) / VAL_GROUP_SIZE; + uint num_val_groups = d / VAL_GROUP_SIZE; #endif #if VAL_SCALES == QUANTIZE_COMMON float v_scale = convert_float(*V_scales); @@ -249,6 +251,34 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, / VAL_ZP_ELEMENTS_PER_BYTE; #endif + /* Load Q tile, destined for SLM */ + q_tile_type Q_tile; + uint q0_copy = q_tile_sg_n * sg_ij; +#ifdef BLOCK_Q + tile_load_block_rem_q( + &Q_tile, (global uint *)Q, q, ldq >> 1, 0, wg_j0 + q0_copy); +#elif Q_ALIGN >= 4 + tile_load(&Q_tile, (global uint *)Q, (d + 1) >> 1, q, ldq >> 1, 0, + wg_j0 + q0_copy); +#else + tile_load_packed_half(&Q_tile, Q, d, q, ldq, 0, wg_j0 + q0_copy); +#endif + + /* Load scale */ +#if WITH_ATTN_SCALE +#if INVERT_SCALE + float iscale = convert_float(*scale_ptr); + float scale = native_recip(iscale); +#else + float scale = convert_float(*scale_ptr); + float iscale = native_recip(scale); +#endif +#else + float scale = 1.0; + float iscale = 1.0; +#endif + scale *= 1.442695f; // log2(e) + #ifdef PREFETCH_K0 /* Prefetch first K tile. 
*/ cooperative_prefetch_2d_k( @@ -267,9 +297,9 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, cooperative_prefetch_2d_maybe_rem( /* ptr */ K_scales, /* r */ k, - /* c */ d / KEY_GROUP_SIZE, + /* c */ num_key_groups, /* rmax */ ugemm_kq_wg_tile_m, - /* cmax */ PREFETCH_D_MAX / KEY_GROUP_SIZE, + /* cmax */ D_MAX / KEY_GROUP_SIZE, /* ld */ ldkq, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, @@ -280,9 +310,9 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, cooperative_prefetch_2d_maybe_rem( /* ptr */ K_zp, /* r */ k, - /* c */ d / KEY_GROUP_SIZE, + /* c */ num_key_groups, /* rmax */ ugemm_kq_wg_tile_m, - /* cmax */ PREFETCH_D_MAX / KEY_GROUP_SIZE, + /* cmax */ D_MAX / KEY_GROUP_SIZE, /* ld */ ldkq, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, @@ -291,34 +321,6 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #endif #endif - /* Load Q tile, destined for SLM */ - q_tile_type Q_tile; - uint q0_copy = q_tile_sg_n * sg_ij; -#ifdef BLOCK_Q - tile_load_block_rem_q( - &Q_tile, (global uint *)Q, q, ldq >> 1, 0, wg_j0 + q0_copy); -#elif Q_ALIGN >= 4 - tile_load(&Q_tile, (global uint *)Q, (d + 1) >> 1, q, ldq >> 1, 0, - wg_j0 + q0_copy); -#else - tile_load_packed_half(&Q_tile, Q, d, q, ldq, 0, wg_j0 + q0_copy); -#endif - - /* Load scale */ -#if WITH_ATTN_SCALE -#if INVERT_SCALE - float iscale = convert_float(*scale_ptr); - float scale = native_recip(iscale); -#else - float scale = convert_float(*scale_ptr); - float iscale = native_recip(scale); -#endif -#else - float scale = 1.0; - float iscale = 1.0; -#endif - scale *= 1.442695f; // log2(e) - /* Initialize S column sums in SLM to -inf */ const uint n_col_sg = DIV_UP(ugemm_kq_wg_tile_n, SUBGROUP_SIZE * sg_per_wg); const float neg_inf = -INFINITY; @@ -464,7 +466,7 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, /* Prefetch V scales. */ cooperative_prefetch_2d_maybe_rem( /* ptr */ V_scales, - /* r */ (d / VAL_GROUP_SIZE), + /* r */ num_val_groups, /* c */ k - k0, /* rmax */ PREFETCH_D_MAX / VAL_GROUP_SIZE, /* cmax */ k_chunk, @@ -478,7 +480,7 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, /* Prefetch V zero points. 
*/ cooperative_prefetch_2d_maybe_rem( /* ptr */ V_zp, - /* r */ (d / VAL_GROUP_SIZE), + /* r */ num_val_groups, /* c */ k - k0, /* rmax */ PREFETCH_D_MAX / VAL_GROUP_SIZE, /* cmax */ k_chunk, @@ -570,12 +572,13 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, #else const uint stride_k = 1; #endif + cooperative_prefetch_2d_k( /* ptr */ K + (k0 + ugemm_kq_wg_tile_m) * stride_k, /* r */ k - k0 - ugemm_kq_wg_tile_m, /* c */ d, /* rmax */ ugemm_kq_wg_tile_m, - /* cmax */ ugemm_kq_wg_tile_n, + /* cmax */ D_MAX, /* ld*/ ldk, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, @@ -585,9 +588,9 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, cooperative_prefetch_2d_maybe_rem( /* ptr */ K_scales + (k0 + ugemm_kq_wg_tile_m), /* r */ k - k0 - ugemm_kq_wg_tile_m, - /* c */ d / KEY_GROUP_SIZE, + /* c */ num_key_groups, /* rmax */ ugemm_kq_wg_tile_m, - /* cmax */ ugemm_kq_wg_tile_n / KEY_GROUP_SIZE, + /* cmax */ D_MAX / KEY_GROUP_SIZE, /* ld */ ldkq, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, @@ -598,9 +601,9 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, cooperative_prefetch_2d_maybe_rem( /* ptr */ K_zp + (k0 + ugemm_kq_wg_tile_m), /* r */ k - k0 - ugemm_kq_wg_tile_m, - /* c */ d / KEY_GROUP_SIZE, + /* c */ num_key_groups, /* rmax */ ugemm_kq_wg_tile_m, - /* cmax */ ugemm_kq_wg_tile_n / KEY_GROUP_SIZE, + /* cmax */ D_MAX / KEY_GROUP_SIZE, /* ld */ ldkq, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, @@ -627,12 +630,12 @@ micro_sdpa(const global KEY_DATA_T *K, const global half *Q, /* cache */ LSC_LDCC_L1C_L3C); #else cooperative_prefetch_2d_maybe_rem( - /* ptr */ msk + k0 + ugemm_kq_sg_tile_m + (wg_j0)*MSK_S2, + /* ptr */ msk + k0 + ugemm_kq_sg_tile_m + (wg_j0)*ldmsk, /* r */ k - k0 - ugemm_kq_wg_tile_m, /* c */ q - wg_j0, /* rmax */ ugemm_kq_wg_tile_m, /* cmax */ (ugemm_kq_wg_tile_n * PREFETCH_D_MAX) / D_MAX, - /* ld */ MSK_S2, + /* ld */ ldmsk, /* sg_id */ sg_ij, /* n_sg */ sg_per_wg, /* sg_size */ SUBGROUP_SIZE, From 0094479374e17ff8f5e24f2e4700c1c3a43a2eb8 Mon Sep 17 00:00:00 2001 From: Daniel Youssif Date: Thu, 9 Jan 2025 22:40:37 +0000 Subject: [PATCH 25/40] xe: jit: conv: cast mask to s16 before mad --- src/gpu/intel/jit/conv/zp_plan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/intel/jit/conv/zp_plan.cpp b/src/gpu/intel/jit/conv/zp_plan.cpp index 13f7aae303a..9a6e3927dc9 100644 --- a/src/gpu/intel/jit/conv/zp_plan.cpp +++ b/src/gpu/intel/jit/conv/zp_plan.cpp @@ -1268,7 +1268,7 @@ class zp_comp_apply_plan_t : public base_plan_t { const expr_t &c_buf, const split_dispatcher_t &sd, int subtile_idx) const { const auto comp_type = comp_layout_.type(); - const auto mask_type = mask_layout_.type(); + const auto mask_type = type_t::s16(); const dim_t kw_dim = comp_layout_.dim(comp_kw_idx_); std::vector comp_off; std::vector mask_off; From fbb574a3d7c382dd872c7f3d11bf2a0c120b9f0e Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 7 Jan 2025 13:35:55 -0800 Subject: [PATCH 26/40] xe: sdap: Add support for non-power of 2 head_size with quantization --- src/gpu/intel/ocl/micro_sdpa.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp index 6a6aa30cf1b..3a47fdaf148 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cpp +++ b/src/gpu/intel/ocl/micro_sdpa.cpp @@ -331,8 +331,9 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) { problem_vs.aOffset = ABOffset::Calc; } if (with_value_scales() || with_value_zp()) { - problem_vs.aqGroupM - = 
(vs_common_scales || vs_common_zp) ? 1 : value_group_size(); + problem_vs.aqGroupM = (vs_common_scales || vs_common_zp) + ? 1 + : utils::rnd_up_pow2(value_group_size()); problem_vs.aqGroupK = 1; } opts_vs.scaleA = with_value_scales() && !vs_common_scales; From 16bd29961bd9915af1a8d3227610563081f5f98c Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 7 Jan 2025 14:42:40 -0800 Subject: [PATCH 27/40] xe: sdpa: Add support for 16,16,8,2 vs work group configuration --- src/gpu/intel/ocl/tile_ops.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpu/intel/ocl/tile_ops.h b/src/gpu/intel/ocl/tile_ops.h index 527087e4259..6107f01f0a9 100644 --- a/src/gpu/intel/ocl/tile_ops.h +++ b/src/gpu/intel/ocl/tile_ops.h @@ -134,6 +134,7 @@ DEF_BLOCK_LOAD_STORE16(uint, uint, ) pp, w - 1, h - 1, ld - 1, coord, as_##itype##vl(v)); \ } +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m8k16v1, 16, 8) DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m4k32v1, 32, 4) DEF_BLOCK2D_LOAD_STORE(half, ushort, 16, 16, u16_m8k32v1, 32, 8) From be408c0415af8114dfe3a228c0a95d039d8bbb07 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 7 Jan 2025 16:44:27 -0800 Subject: [PATCH 28/40] xe: sdpa: Add checks for VS ugemm group size when not power of 2 --- src/gpu/intel/ocl/micro_sdpa.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.hpp b/src/gpu/intel/ocl/micro_sdpa.hpp index a34c17cb5c4..e409a7f145a 100644 --- a/src/gpu/intel/ocl/micro_sdpa.hpp +++ b/src/gpu/intel/ocl/micro_sdpa.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include "common/c_types_map.hpp" #include "common/gemm_types.hpp" #include "common/gemm_utils.hpp" +#include "common/math_utils.hpp" #include "common/primitive.hpp" #include "common/sdpa_pd.hpp" #include "common/type_helpers.hpp" @@ -140,6 +141,16 @@ struct micro_sdpa_t : public gpu_primitive_t { value_group_size()); } + if (!desc()->vs_scales.has_default_values() + || !desc()->vs_zero_points.has_default_values()) { + int vgs = value_group_size(); + VDISPATCH_SDPA( + math::is_pow2(vgs) || vgs == val_md()->dims[3], + "the value group size(%d) must be a power of 2 or " + "equal to the number of values(%d).", + vgs, val_md()->dims[3]); + } + CHECK(init_microkernels(engine)); return status::success; } From 71cb6de8f6eae393401a090d7fc5245e0e8357e6 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Fri, 10 Jan 2025 09:24:40 -0800 Subject: [PATCH 29/40] test: sdpa: Add test case for phi3(non-pow2 head_size) --- tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all index 6f1d9b9680c..4043d6ae7a8 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all @@ -56,3 +56,8 @@ --reset --expected-n-partitions=0 --in-shapes=4:20x117x48x128+3:20x1x128x117+0:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-int8-bs1.json --reset --expected-n-partitions=0 --in-shapes=4:32x16x384x64+3:32x16x64x384+0:32x16x384x64+1:32x1x1x384 --case=complex_fusion/mha/dynamic_quantized_mha-Bert_large-inf-int8-bs1-fake.json --reset --in-shapes=4:20x16x384x64+3:20x16x64x384+0:20x16x384x64+1:20x1x1x384 --case=complex_fusion/mha/sdpa-plain-wo-scale-int8-bs1.json + +# phi3-mini-4k-instruct +--reset +--in-shapes=0:1x32x96x384*abdc+1:1x32x1x384+2:1x32x1x384+3:1x32x384x96+5:1x1x384x384+6:1x32x384x96+7:1x32x384x1+8:1x32x384x1 +--op-attrs=34107656704:group_shape:1x1x96x1+34107752448:group_shape:1x1x1x96 --in-shapes= --case=complex_fusion/mha/sdpa-compressed-kv-int8-gs128.json From 69bd2f9bc5dfc90c776369b2bd961afd70615806 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Thu, 9 Jan 2025 14:36:11 -0800 Subject: [PATCH 30/40] xe: ocl: gemm: fixup biasless kernel generation --- src/gpu/intel/ocl/gemm/gemm_with_post_ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/gemm/gemm_with_post_ops.cpp b/src/gpu/intel/ocl/gemm/gemm_with_post_ops.cpp index e7a54ee19f7..36a52fa40c0 100644 --- a/src/gpu/intel/ocl/gemm/gemm_with_post_ops.cpp +++ b/src/gpu/intel/ocl/gemm/gemm_with_post_ops.cpp @@ -142,7 +142,9 @@ status_t gemm_with_post_ops_t::pd_t::init_kernel_ctx( auto c_type = dst_md(0)->data_type; const auto src_info = memory_desc_info_t::create(gemm_pd_->dst_md(0)); const auto bias_info = [&]() { - auto info = memory_desc_info_t::create(src_md(2)); + // If no bias, just default to same layout as dst - any valid layout will work, it's just a dummy + auto info = memory_desc_info_t::create( + with_bias() ? 
src_md(2) : dst_md(0)); if (info.data_type == data_type::undef) info.data_type = data_type::f32; return info; }(); From df5f802bb419c3921149fd295c544a814ebcb7b3 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Thu, 9 Jan 2025 17:15:48 -0800 Subject: [PATCH 31/40] xe: ocl: gemm: fixup postop normalization --- src/gpu/intel/ocl/gemm_matmul.hpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/gpu/intel/ocl/gemm_matmul.hpp b/src/gpu/intel/ocl/gemm_matmul.hpp index ab1d5d9c7a0..5b1e19cbaf3 100644 --- a/src/gpu/intel/ocl/gemm_matmul.hpp +++ b/src/gpu/intel/ocl/gemm_matmul.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -88,13 +88,13 @@ struct gemm_matmul_t : public gpu_primitive_t { CHECK(map_gemm_zp(DNNL_ARG_DST, DNNL_ARG_C)); } - auto maybe_reshape = [&](dims_t &orig_a_dims, dims_t &orig_b_dims, - dims_t &orig_c_dims, - dims_t &orig_bias_dims, - const int orig_dims) { + auto maybe_reshape + = [&](dims_t &orig_a_dims, dims_t &orig_b_dims, + dims_t &orig_c_dims, dims_t &orig_bias_dims, + const int orig_dims) -> status_t { int batch_b_dims = 1; - for (int i = b_md->ndims; i > 2; i--) { - batch_b_dims *= b_md->dims[b_md->ndims - i]; + for (int i = 0; i < b_md->ndims - 2; i++) { + batch_b_dims *= b_md->dims[i]; } for (int i = 0; i < orig_dims; i++) { orig_a_dims[i] = a_md->dims[i]; @@ -161,7 +161,7 @@ struct gemm_matmul_t : public gpu_primitive_t { for (int i = 0; i < attr()->post_ops_.len(); i++) { auto &po = post_ops.entry_[i]; if (po.is_binary()) { - auto &po_desc = po.binary.src1_desc; + const auto &po_desc = po.binary.src1_desc; auto a_dim = po_desc.dims[po_desc.ndims - reshape_size]; for (int i = po_desc.ndims; i > reshape_size; i--) { @@ -187,9 +187,11 @@ struct gemm_matmul_t : public gpu_primitive_t { ? po_desc.dims[po_desc.ndims - 1] : 1; } - CHECK(memory_desc_reshape( - po_desc, po_desc, reshape_size, po_dims)); - tmp_post_ops.entry_[i].binary.src1_desc = po_desc; + memory_desc_t tmp_po_desc; + CHECK(memory_desc_reshape(tmp_po_desc, po_desc, + reshape_size, po_dims)); + tmp_post_ops.entry_[i].binary.src1_desc + = tmp_po_desc; } else if (po.is_prelu()) { auto mask = po.prelu.mask; int new_mask = 0; From 8f16e53d60e5af3d5082e2fb03b0bbfdfec200ea Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 9 Jan 2025 22:34:36 -0800 Subject: [PATCH 32/40] xe: jit: gemm: microkernel strategy heuristic adjustments for sdpa --- src/gpu/intel/jit/gemm/generator/microkernel_provider.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gpu/intel/jit/gemm/generator/microkernel_provider.cpp b/src/gpu/intel/jit/gemm/generator/microkernel_provider.cpp index db34fbf4690..ceae9105328 100644 --- a/src/gpu/intel/jit/gemm/generator/microkernel_provider.cpp +++ b/src/gpu/intel/jit/gemm/generator/microkernel_provider.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -264,12 +264,11 @@ static inline bool getStrategyByHeuristics(HW hw, GEMMStrategy &strategy, bool l } else if (!block2DA) { s.A.accessType = AccessType::Block; if (systolic) - s.ka_load = (problem.A.layout == MatrixLayout::T) ? 32 : 16; + s.ka_load = (problem.A.layout == MatrixLayout::T) ? (64 / problem.Ta_ext) : 16; s.slmA = true; - } else if (problem.A.layout == MatrixLayout::T) { s.A.accessType = AccessType::Block2DTranspose; - s.ka_load = 32; + s.ka_load = 64 / problem.Ta_ext; } else if (problem.A.layout == MatrixLayout::N) { s.A.accessType = AccessType::Block2DVNNI; s.A_copies = 2; From 6fed6157d7c41fc17b56ae45970f8dcc4f46c246 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 9 Jan 2025 22:35:33 -0800 Subject: [PATCH 33/40] xe: ocl: sdpa: strategy adjustments for quantized SDPA --- src/gpu/intel/ocl/micro_sdpa.cpp | 58 +++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp index 3a47fdaf148..2e6e700a822 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cpp +++ b/src/gpu/intel/ocl/micro_sdpa.cpp @@ -52,16 +52,26 @@ sdpa_config_t xehpg_h32_s64 = {16, 16, 16, 8, 4, 4, 2, 8}; sdpa_config_t xehpg_h32_s32 = {8, 8, 8, 8, 4, 4, 4, 4}; sdpa_config_t xehpg_h32_2nd = {8, 32, 16, 8, 8, 1, 2, 4}; +sdpa_config_t xehpg_q_h32 = {32, 16, 16, 16, 2, 8, 2, 8}; +sdpa_config_t xehpg_q_h32_2nd = {32, 16, 8, 8, 8, 1, 4, 2}; + sdpa_config_t xehpg_h64 = {32, 16, 16, 16, 4, 8, 4, 8}; sdpa_config_t xehpg_h64_s128 = {16, 16, 16, 16, 4, 8, 4, 8}; sdpa_config_t xehpg_h64_s64 = {32, 16, 16, 8, 8, 4, 4, 8}; sdpa_config_t xehpg_h64_2nd = {8, 16, 16, 8, 8, 1, 4, 2}; +sdpa_config_t xehpg_q_h64 = {32, 16, 16, 16, 4, 4, 4, 4}; +sdpa_config_t xehpg_q_h64_2nd = {16, 16, 8, 8, 16, 1, 8, 2}; + sdpa_config_t xehpg_h128 = {16, 16, 32, 8, 8, 4, 4, 8}; sdpa_config_t xehpg_h128_s32 = {16, 16, 16, 8, 16, 2, 8, 4}; sdpa_config_t xehpg_h128_2nd = {8, 16, 16, 8, 16, 1, 8, 2}; sdpa_config_t xehpg_h128_s256_2nd = {8, 16, 32, 8, 8, 1, 4, 2}; +sdpa_config_t xehpg_q_h128 = {32, 16, 16, 16, 8, 4, 8, 4}; +sdpa_config_t xehpg_q_h128_2nd = {32, 16, 16, 8, 16, 1, 8, 2}; +sdpa_config_t xehpg_q_h128_s64_2nd = {16, 16, 16, 8, 16, 1, 8, 2}; + sdpa_config_t xehpg_h256 = {16, 16, 32, 8, 16, 2, 8, 4}; sdpa_config_t xehpg_h256_s128 = {8, 16, 32, 16, 8, 4, 8, 4}; sdpa_config_t xehpg_h256_s32 = {8, 16, 32, 8, 16, 2, 8, 4}; @@ -79,28 +89,53 @@ sdpa_config_t xehpc_h64_s32 = {16, 16, 16, 16, 4, 2, 4, 2}; sdpa_config_t xehpc_h64_2nd = {32, 32, 32, 16, 4, 1, 2, 2}; sdpa_config_t xehpc_h64_s64_2nd = {16, 16, 16, 16, 4, 1, 4, 1}; +sdpa_config_t xehpc_q_h64 = {16, 64, 32, 16, 8, 4, 2, 16}; + sdpa_config_t xehpc_h128 = {16, 64, 32, 16, 16, 2, 4, 8}; sdpa_config_t xehpc_h128_s64 = {16, 32, 32, 32, 4, 2, 4, 2}; sdpa_config_t xehpc_h128_s32 = {16, 16, 16, 16, 8, 2, 8, 2}; sdpa_config_t xehpc_h128_2nd = {32, 32, 32, 16, 8, 1, 4, 2}; +sdpa_config_t xehpc_q_h128 = {16, 64, 16, 32, 16, 2, 8, 4}; +sdpa_config_t xehpc_q_h128_s64 = {16, 16, 32, 16, 4, 4, 4, 4}; +sdpa_config_t xehpc_q_h128_s32 = {16, 16, 32, 16, 4, 2, 4, 2}; +sdpa_config_t xehpc_q_h128_2nd = {32, 32, 16, 32, 4, 1, 4, 1}; +sdpa_config_t xehpc_q_h128_s32_2nd = {16, 32, 16, 16, 8, 1, 4, 2}; + sdpa_config_t xehpc_h256 = {16, 32, 32, 32, 8, 4, 8, 4}; sdpa_config_t xehpc_h256_s64 = {16, 32, 32, 32, 8, 1, 8, 1}; sdpa_config_t xehpc_h256_2nd = {16, 16, 16, 16, 16, 1, 16, 1}; -sdpa_config_t *choose_config_xehpg(dim_t head_size, dim_t seq, dim_t thin_q) { +sdpa_config_t *choose_config_xehpg( + dim_t head_size, 
dim_t seq, bool thin_q, bool quantized) { if (head_size <= 32) { + if (quantized && seq >= 128) { + if (thin_q) return &xehpg_q_h32_2nd; + return &xehpg_q_h32; + } if (thin_q) return &xehpg_h32_2nd; if (seq <= 32) return &xehpg_h32_s32; if (seq <= 64) return &xehpg_h32_s64; if (seq <= 256) return &xehpg_h32_s256; return &xehpg_h32; } else if (head_size <= 64) { + if (quantized) { + if (thin_q) return &xehpg_q_h64_2nd; + return &xehpg_q_h64; + } if (thin_q) return &xehpg_h64_2nd; if (seq <= 64) return &xehpg_h64_s64; if (seq <= 128) return &xehpg_h64_s128; return &xehpg_h64; } else if (head_size <= 128) { + if (quantized) { + if (thin_q) { + if (seq <= 64) return &xehpg_q_h128_s64_2nd; + return &xehpg_q_h128_2nd; + } + if (seq <= 32) return &xehpg_h128_s32; + return &xehpg_q_h128; + } if (thin_q) { if (seq <= 256) return &xehpg_h128_s256_2nd; return &xehpg_h128_2nd; @@ -120,7 +155,8 @@ sdpa_config_t *choose_config_xehpg(dim_t head_size, dim_t seq, dim_t thin_q) { return nullptr; } -sdpa_config_t *choose_config_xehpc(dim_t head_size, dim_t seq, bool thin_q) { +sdpa_config_t *choose_config_xehpc( + dim_t head_size, dim_t seq, bool thin_q, bool quantized) { if (head_size <= 32) { if (thin_q) return &xehpc_h32_2nd; if (seq <= 32) return &xehpc_h32_s32; @@ -130,10 +166,20 @@ sdpa_config_t *choose_config_xehpc(dim_t head_size, dim_t seq, bool thin_q) { if (seq <= 64) return &xehpc_h64_s64_2nd; return &xehpc_h64_2nd; } + if (quantized && seq >= 256) return &xehpc_q_h64; if (seq <= 32) return &xehpc_h64_s32; if (seq <= 64) return &xehpc_h64_s64; return &xehpc_h64; } else if (head_size <= 128) { + if (quantized) { + if (thin_q) { + if (seq <= 32) return &xehpc_q_h128_s32_2nd; + return &xehpc_q_h128_2nd; + } + if (seq <= 32) return &xehpc_q_h128_s32; + if (seq <= 64) return &xehpc_q_h128_s64; + return &xehpc_q_h128; + } if (thin_q) return &xehpc_h128_2nd; if (seq <= 32) return &xehpc_h128_s32; if (seq <= 64) return &xehpc_h128_s64; @@ -190,15 +236,19 @@ status_t micro_sdpa_t::pd_t::init_microkernels(impl::engine_t *engine) { /* Retrieve pre-tuned kernel configuration */ sdpa_config_t *config = nullptr; bool thin_q = (d->queries() <= 16); + bool quantized = types::is_integral_dt(key_md()->data_type) + || types::is_integral_dt(val_md()->data_type); switch (arch_) { case arch_t::xe_hpg: - config = choose_config_xehpg(d->head_size(), d->keys(), thin_q); + config = choose_config_xehpg( + d->head_size(), d->keys(), thin_q, quantized); break; case arch_t::xe_hpc: case arch_t::xe2: case arch_t::xe3: - config = choose_config_xehpc(d->head_size(), d->keys(), thin_q); + config = choose_config_xehpc( + d->head_size(), d->keys(), thin_q, quantized); default: break; } From 57abed954a636f2e5a6297d5452ee2e87055879b Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 9 Jan 2025 21:47:11 -0800 Subject: [PATCH 34/40] x64: brgemm: update condition regarding rdb_tail --- src/cpu/x64/brgemm/brgemm.cpp | 2 +- src/cpu/x64/brgemm/brgemm_utils.cpp | 21 ++++++++++++++++++++- src/cpu/x64/brgemm/brgemm_utils.hpp | 4 ++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/cpu/x64/brgemm/brgemm.cpp b/src/cpu/x64/brgemm/brgemm.cpp index 8a663cfa18f..38e2975fd02 100644 --- a/src/cpu/x64/brgemm/brgemm.cpp +++ b/src/cpu/x64/brgemm/brgemm.cpp @@ -534,7 +534,7 @@ status_t brgemm_desc_set_attr( if (brg->is_dgmm) CHECK(brdgmm_blocking(brg)); else - CHECK(brgemm_blocking(brg)); + CHECK(brgemm_blocking(brg, true)); } if (!brg->is_dgmm) { diff --git a/src/cpu/x64/brgemm/brgemm_utils.cpp 
b/src/cpu/x64/brgemm/brgemm_utils.cpp index 6473b9701de..47a92327e01 100644 --- a/src/cpu/x64/brgemm/brgemm_utils.cpp +++ b/src/cpu/x64/brgemm/brgemm_utils.cpp @@ -252,7 +252,7 @@ int calculate_max_bcast_block(brgemm_desc_t *brg, const int adj_ld_block2) { return max_bcast_block; } -status_t brgemm_blocking(brgemm_desc_t *brg) { +status_t brgemm_blocking(brgemm_desc_t *brg, bool attr_blocking) { const data_type_t ld_step_compute_dt = get_mac_emu_data_type(brg->dt_b, brg->isa_impl, brg->isa_impl != avx2_vnni_2 && !brg->is_fp8_via_convert()); @@ -750,6 +750,25 @@ status_t brgemm_blocking(brgemm_desc_t *brg) { brg->rdb = brg->reduce_dim / brg->rd_block; brg->rdb_tail = brg->reduce_dim % brg->rd_block; + // Remove these guards in the future (add tail processing by reduction + // dimension) + // TODO: these checks do not work for fp8-f16 and f16-fp8 cfgs + if (attr_blocking + && !IMPLICATION(brg->rdb > 0 && brg->rdb_tail, + brg->is_input_convert() || brg->amx_wary_k_tail())) { + return status::unimplemented; + } + + if (attr_blocking + && !IMPLICATION( + (brg->rdb_tail + % ((brg->is_bf16_tmm || brg->is_f16_tmm) ? 2 + : 4)) + != 0, + brg->is_input_convert() || brg->amx_wary_k_tail())) { + return status::unimplemented; + } + //TODO: check this condition brg->interleave_tilestores_ = brg->beta == 0 && (brg->brgattr.use_interleave_stores diff --git a/src/cpu/x64/brgemm/brgemm_utils.hpp b/src/cpu/x64/brgemm/brgemm_utils.hpp index db2fc9a2a8d..a62af01465c 100644 --- a/src/cpu/x64/brgemm/brgemm_utils.hpp +++ b/src/cpu/x64/brgemm/brgemm_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ bool can_dispatch_uker(const brgemm_desc_t *brg);
 
 void maybe_try_bf32(brgemm_desc_t *brg);
 
-status_t brgemm_blocking(brgemm_desc_t *brg);
+status_t brgemm_blocking(brgemm_desc_t *brg, bool attr_blocking = false);
 
 status_t brdgmm_blocking(brgemm_desc_t *brg);
 
From 58ded61884edd096fbd6cf5a879c888c57070957 Mon Sep 17 00:00:00 2001
From: Andrey Kalinin
Date: Thu, 9 Jan 2025 21:47:55 -0800
Subject: [PATCH 35/40] x64: matmul: no extendable_k for packed_sparse_weights

---
 src/cpu/x64/matmul/brgemm_matmul_utils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpu/x64/matmul/brgemm_matmul_utils.cpp b/src/cpu/x64/matmul/brgemm_matmul_utils.cpp
index 6f581e6d140..a14291cacd5 100644
--- a/src/cpu/x64/matmul/brgemm_matmul_utils.cpp
+++ b/src/cpu/x64/matmul/brgemm_matmul_utils.cpp
@@ -2037,8 +2037,8 @@ void matmul_amx_blocking_params_t::set_blocking_parameters(
 
     const dim_t current_k_tail = K % k_blk_;
 
-    extendable_k_
-            = !use_buffer_a && K % wei_k_blk && k_chunk_elems_ > wei_k_blk;
+    extendable_k_ = !use_buffer_a && K % wei_k_blk
+            && k_chunk_elems_ > wei_k_blk && !packed_sparse_weights;
 
     if (extendable_k_) {
         if (k_chunk_elems_ >= K) {
From 7a50e03cfc84e6505c6ff041ec4d4be9e81a6a43 Mon Sep 17 00:00:00 2001
From: "Wang, Zhitao"
Date: Thu, 19 Dec 2024 08:34:50 +0000
Subject: [PATCH 36/40] graph: backend: dnnl: support permute for scale and zps

---
 .../dnnl/kernels/sdp_primitive_config.cpp | 5 +++++
 src/graph/backend/dnnl/subgraph.cpp | 22 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp b/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp
index ec38a9598c2..ebf50cac1db 100644
--- a/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp
+++ b/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp
@@ -179,6 +179,11 @@ status_t sdp_primitive_config_t::initial_check(
             }
         }
         if (op_kind != graph::op_kind::MatMul) continue;
+        // TODO(zhitao): execute the reorder for scale and zps manually if the
+        // transpose attribute is specified as true.
+        if (cur_op->has_attr(op_attr::transpose_b)
+                && cur_op->get_attr<bool>(op_attr::transpose_b))
+            return status::unimplemented;
         auto post_op = get_post_op(cur_op);
         if (post_op && mm1_post_op_kind.count(post_op->get_kind())) {
             mm1 = cur_op;
diff --git a/src/graph/backend/dnnl/subgraph.cpp b/src/graph/backend/dnnl/subgraph.cpp
index 512aef1dd42..5b60572d989 100644
--- a/src/graph/backend/dnnl/subgraph.cpp
+++ b/src/graph/backend/dnnl/subgraph.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright 2022-2024 Intel Corporation
+ * Copyright 2022-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -443,6 +443,26 @@ void subgraph_rewriter_t::insert_op_before(const op_ptr &inserted_op,
     auto in_dtype = in_val->get_logical_tensor().data_type;
     new_val->set_data_type(in_dtype);
 
+    if (inserted_op->get_kind() == op_kind::dnnl_permute
+            && (base_op->get_kind() == op_kind::dnnl_mul_scales
+                    || base_op->get_kind() == op_kind::dnnl_sub_zps)) {
+        // Only abx tag is respected for scale and zps inputs, should set
+        // strides explicitly and execute reorder.
+
+        dnnl::memory::desc in_md
+                = make_dnnl_memory_desc(in_val->get_logical_tensor());
+        const auto &perm = inserted_op->get_attr<std::vector<int64_t>>(
+                op_attr::permutation);
+        std::vector<int> int_perm(perm.size(), -1);
+        for (size_t i = 0; i < perm.size(); i++) {
+            int_perm[i] = static_cast<int>(perm[i]);
+        }
+        dnnl::memory::desc out_md = in_md.permute_axes(int_perm);
+        const auto &dims = out_md.get_dims();
+        // set the strides with abx tag.
+        new_val->set_strides(get_dense_strides(dims));
+    }
+
     if (k == std::numeric_limits<size_t>::max()) {
         k = inserted_op->num_outputs();
     }
From d18538ed231ed0483586fa7083d9e1dbc7942cde Mon Sep 17 00:00:00 2001
From: "Wang, Zhitao"
Date: Fri, 20 Dec 2024 01:30:48 +0000
Subject: [PATCH 37/40] benchdnn: inputs: graph: add transposed case for compressed sdpa

---
 src/graph/backend/dnnl/passes/insert_ops.cpp | 3 ---
 tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/graph/backend/dnnl/passes/insert_ops.cpp b/src/graph/backend/dnnl/passes/insert_ops.cpp
index d29c08341f3..19d127ba1b2 100644
--- a/src/graph/backend/dnnl/passes/insert_ops.cpp
+++ b/src/graph/backend/dnnl/passes/insert_ops.cpp
@@ -442,9 +442,6 @@ status_t insert_permute_for_dynamic_mul_scale_sub_zp(
             std::swap(group_shape[ndims - 1], group_shape[ndims - 2]);
             cur_op->set_attr<std::vector<int64_t>>(
                     op_attr::group_shape, group_shape);
-        } else { // per-channel quantization
-            const auto axis = cur_op->get_attr<int64_t>(op_attr::axis);
-            cur_op->set_attr<int64_t>(op_attr::axis, (2 * ndims - 3) - axis);
         }
     }
 
diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
index 4043d6ae7a8..f4ec24d0c55 100644
--- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
+++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
@@ -46,6 +46,7 @@
 --reset --dt=f32,bf16,f16 --in-shapes=0:32x16x128x64+1:32x16x128x64+5:32x16x128x128+8:32x16x128x64 --case=complex_fusion/mha/sdpa-plain-simplified-f16.json
 --reset --dt=f32,bf16,f16 --in-shapes=0:acbd+1:acbd+8:acbd --case=complex_fusion/mha/sdpa-plain-simplified-f16.json
 --reset --dt=f32,bf16,f16 --in-shapes=3:384,3:384x384,3:1x16x384x384 --case=complex_fusion/mha/sdpa-plain-scale-by-mul-f16.json
+--reset --op-attrs=34107656704:group_shape:1x1x1x32+34107654464:transpose_b:1 --in-shapes=0:1x32x32x128+1:1x32x32x4+2:1x32x32x4 --case=complex_fusion/mha/sdpa-compressed-k-int8-gs32.json
 
 # Re-written int8 graphs
 --reset --in-shapes=5:4x16x32x256+4:4x16x256x33+0:4x16x33x256+1:4x1x1x33+3:4x1x32x33 --case=complex_fusion/mha/MHA-GPT-inf-int8-bs1.json
From d8d68816cf22d5b608645e03e6b978e46ab00309 Mon Sep 17 00:00:00 2001
From: "Wang, Zhitao"
Date: Thu, 26 Dec 2024 11:20:09 +0000
Subject: [PATCH 38/40] graph: backend: dnnl: fix shape check for per-channel dynamic quant

---
 src/graph/backend/dnnl/passes/lower.cpp | 22 ++++++-------
 src/graph/interface/op_def_constraint.cpp | 39 +++++++++++++++++++----
 2 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/src/graph/backend/dnnl/passes/lower.cpp b/src/graph/backend/dnnl/passes/lower.cpp
index c7de715df3c..7ce48ae04c0 100644
--- a/src/graph/backend/dnnl/passes/lower.cpp
+++ b/src/graph/backend/dnnl/passes/lower.cpp
@@ -568,15 +568,10 @@ static status_t dynamic_dequant_handler(
     const auto scale_lt = scales->get_logical_tensor();
     const auto ndims = ltw(src_lt).ndims();
 
-    VCHECK_INVALID_ARGUMENT((ndims == ltw(scale_lt).ndims()),
-            "scale and src should have the same number of dimensions "
-            "for grouped quantization");
     VCHECK_INVALID_ARGUMENT(
            (static_cast<size_t>(ndims) == group_shape.size()),
            "group shape size should match the number of dimensions of "
            "src");
-    VCHECK_UNIMPLEMENTED((ndims >= 2),
-            "group quantization requires at least two dimensions");
 
     const auto &src_dims = ltw(src_lt).vdims();
     const auto &scale_dims = ltw(scale_lt).vdims();
@@ -627,14 +622,7 @@
     rewriter.to_insert(mul_scales);
 
     if (has_zps) {
-        value_ptr scales = in_vals[1], zps = in_vals[2];
-        const auto &scale_dims = ltw(scales->get_logical_tensor()).vdims();
-        const auto &zp_dims = ltw(zps->get_logical_tensor()).vdims();
-        for (size_t idx = 0; idx < scale_dims.size(); ++idx) {
-            VCHECK_INVALID_ARGUMENT((scale_dims[idx] == zp_dims[idx]),
-                    "scale and zero point tensors should have the same shape");
-        }
-
+        value_ptr zps = in_vals[2];
         const int64_t zps_data_type = zps->get_logical_tensor().data_type;
         op_ptr sub_zps = std::make_shared<op_t>(op_kind::dnnl_sub_zps);
         sub_zps->connect_input(1, zps);
@@ -643,6 +631,14 @@
         sub_zps->set_attr(op_attr::qtype, qtype);
         sub_zps->set_attr(op_attr::data_type, zps_data_type);
         if (is_group_quantization) {
+            value_ptr scales = in_vals[1];
+            const auto &scale_dims = ltw(scales->get_logical_tensor()).vdims();
+            const auto &zp_dims = ltw(zps->get_logical_tensor()).vdims();
+            for (size_t idx = 0; idx < scale_dims.size(); ++idx) {
+                VCHECK_INVALID_ARGUMENT((scale_dims[idx] == zp_dims[idx]),
+                        "scale and zero point tensors should have the same "
+                        "shape");
+            }
             const auto &group_shape = cur_op->get_attr<std::vector<int64_t>>(
                     op_attr::group_shape);
             sub_zps->set_attr<std::vector<int64_t>>(
diff --git a/src/graph/interface/op_def_constraint.cpp b/src/graph/interface/op_def_constraint.cpp
index 7efd6213561..512af96e167 100644
--- a/src/graph/interface/op_def_constraint.cpp
+++ b/src/graph/interface/op_def_constraint.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023-2024 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -328,11 +328,38 @@ bool check_dyn_quant_dequant_scales_zps(const op_t *n) {
 
     // in case of not setting value for zps
     if (sz_zps == DNNL_GRAPH_UNKNOWN_DIM) { return true; }
-    VCHECK_SHAPE_INFER((sz_scales == sz_zps),
-            "%s, scales and zps should keep same. given scale "
-            "size: %d, given zp size: %d.",
-            op_t::kind2str(n->get_kind()).c_str(),
-            static_cast<int>(sz_scales), static_cast<int>(sz_zps));
+    if (qtype == "per_group") {
+        const auto &ndims
+                = n->get_input_value(1)->get_logical_tensor().ndims;
+        const auto &scale_ndims
+                = n->get_input_value(1)->get_logical_tensor().ndims;
+        const auto &scale_dims
+                = n->get_input_value(1)->get_logical_tensor().dims;
+        const auto &zp_ndims
+                = n->get_input_value(2)->get_logical_tensor().ndims;
+        const auto &zp_dims
+                = n->get_input_value(2)->get_logical_tensor().dims;
+        VCHECK_SHAPE_INFER((ndims >= 2),
+                "group quantization requires at least two dimensions");
+        VCHECK_SHAPE_INFER(((ndims == scale_ndims) && (ndims == zp_ndims)),
+                "%s, input, scales and zps should keep the number of "
+                "dimensions for group quantization",
+                op_t::kind2str(n->get_kind()).c_str());
+        VCHECK_SHAPE_INFER(
+                (std::equal(scale_dims, scale_dims + ndims, zp_dims)),
+                "%s, scales and zps should keep the same shape for group "
+                "quantization",
+                op_t::kind2str(n->get_kind()).c_str());
+    }
+
+    if (qtype == "per_channel") {
+        VCHECK_SHAPE_INFER((sz_zps == 1 || sz_scales == sz_zps),
+                "%s, zps should be 1 or equals to scales size for "
+                "per_channel policy, given zps size: %d and scales size: "
+                "%d",
+                op_t::kind2str(n->get_kind()).c_str(),
+                static_cast<int>(sz_zps), static_cast<int>(sz_scales));
+    }
 
     if (qtype == "per_tensor") {
         VCHECK_SHAPE_INFER((sz_zps == 1),
From 124cf4a32d51580971493bf7a4b783d4b175fc6e Mon Sep 17 00:00:00 2001
From: "Wang, Zhitao"
Date: Thu, 26 Dec 2024 11:20:45 +0000
Subject: [PATCH 39/40] benchdnn: inputs: graph: add compressed sdpa with per-channel quant

---
 tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
index f4ec24d0c55..fdafff881d0 100644
--- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
+++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all
@@ -47,6 +47,7 @@
 --reset --dt=f32,bf16,f16 --in-shapes=0:acbd+1:acbd+8:acbd --case=complex_fusion/mha/sdpa-plain-simplified-f16.json
 --reset --dt=f32,bf16,f16 --in-shapes=3:384,3:384x384,3:1x16x384x384 --case=complex_fusion/mha/sdpa-plain-scale-by-mul-f16.json
 --reset --op-attrs=34107656704:group_shape:1x1x1x32+34107654464:transpose_b:1 --in-shapes=0:1x32x32x128+1:1x32x32x4+2:1x32x32x4 --case=complex_fusion/mha/sdpa-compressed-k-int8-gs32.json
+--reset --op-attrs=34107656704:qtype:per_channel*axis:3 --in-shapes=1:32+2:1 --case=complex_fusion/mha/sdpa-compressed-k-int8-gs32.json
 
 # Re-written int8 graphs
 --reset --in-shapes=5:4x16x32x256+4:4x16x256x33+0:4x16x33x256+1:4x1x1x33+3:4x1x32x33 --case=complex_fusion/mha/MHA-GPT-inf-int8-bs1.json
From 38055e0183f2cdb73d7adb819887280bb8a03b82 Mon Sep 17 00:00:00 2001
From: Tomasz Czeszun
Date: Fri, 10 Jan 2025 07:51:50 -0800
Subject: [PATCH 40/40] x64: conv: handle dilation int overflow

---
 src/cpu/x64/jit_avx512_common_conv_kernel.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/cpu/x64/jit_avx512_common_conv_kernel.cpp b/src/cpu/x64/jit_avx512_common_conv_kernel.cpp
index 4513d94faef..5a61ec210fa 100644
--- a/src/cpu/x64/jit_avx512_common_conv_kernel.cpp
+++ b/src/cpu/x64/jit_avx512_common_conv_kernel.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed
under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include "common/type_helpers.hpp" #include "common/utils.hpp" +#include "cpu/cpu_convolution_pd.hpp" #include "cpu/platform.hpp" #include "cpu/x64/cpu_barrier.hpp" #include "cpu/x64/injectors/injector_utils.hpp" @@ -822,6 +823,13 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp, jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims - 4]; jcp.stride_w = cd.strides[ndims - 3]; + // Big int (> INT_MAX) values are unsupported and jcp fields may overflow + // TODO: change data type of jcp fields to size_t + VDISPATCH_CONV_IC(!((ndims == 5 && cd.dilates[ndims - 5] > INT_MAX) + || (ndims >= 4 && cd.dilates[ndims - 4] > INT_MAX) + || (cd.dilates[ndims - 3] > INT_MAX)), + VERBOSE_BAD_PARAM, "dilates"); + jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0; jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims - 4]; jcp.dilate_w = cd.dilates[ndims - 3];
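
The guard added above exists because jit_conv_conf_t keeps dilations in plain int fields, so a descriptor carrying a dilation larger than INT_MAX would silently wrap once copied into jcp. The following is a small standalone sketch of the same range check, not part of the patch itself; the function name and the assumption that dilates is an array of int64_t values with ndims in [3, 5] are illustrative only.

#include <climits>
#include <cstdint>

// Returns true only when every spatial dilation fits into an int; the patch
// expresses the same condition through VDISPATCH_CONV_IC(..., VERBOSE_BAD_PARAM,
// "dilates") so that oversized descriptors fail dispatch instead of overflowing.
static bool dilations_fit_int(const int64_t *dilates, int ndims) {
    // Spatial dilations occupy dilates[0 .. ndims - 3] (d/h/w depending on
    // the tensor rank), mirroring the ndims - 5 / ndims - 4 / ndims - 3 indexing
    // used in init_conf.
    for (int i = 0; i <= ndims - 3; ++i)
        if (dilates[i] > INT_MAX) return false;
    return true;
}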