From 8b6860b0dd95557c7a07473a4ae61020bf6212e7 Mon Sep 17 00:00:00 2001 From: Radomir Djogo Date: Fri, 7 Jun 2024 18:17:41 +0000 Subject: [PATCH 1/5] #4858: add 2nd param to typecast --- .../sweep_tests/pytorch_ops.py | 8 +++--- .../sweep_tests/tt_lib_ops.py | 6 ++--- .../eltwise_unary/eltwise_unary_op.cpp | 7 ++++- .../eltwise_unary/eltwise_unary_op.hpp | 17 +++++++++--- .../csrc/tt_lib_bindings_tensor_xary_ops.cpp | 26 ++++++++++++++----- .../llk_math_eltwise_unary_sfpu_typecast.h | 8 +++--- .../eltwise_unary/typecast.h | 5 ++-- ttnn/cpp/ttnn/operations/unary.hpp | 2 +- 8 files changed, 55 insertions(+), 24 deletions(-) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 8cf1d96d5da..dbe87a4a6a9 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -1374,12 +1374,12 @@ def eltwise_identity(x, *args, **kwargs): return x -def eltwise_typecast(x, *args, tt_output_dtype, **kwargs): - if tt_output_dtype[0] == ttl.tensor.DataType.UINT16: +def eltwise_typecast(x, *args, tt_input_dtype, tt_output_dtype, **kwargs): + if tt_input_dtype[0] == ttl.tensor.DataType.BFLOAT16 and tt_output_dtype[0] == ttl.tensor.DataType.UINT16: return torch.clamp(x.to(torch.int32), min=0, max=65535) # due to no uint16 support - elif tt_output_dtype[0] == ttl.tensor.DataType.UINT32: + elif tt_input_dtype[0] == ttl.tensor.DataType.BFLOAT16 and tt_output_dtype[0] == ttl.tensor.DataType.UINT32: return torch.relu(x.to(torch.int32)) # due to no uint32 support - elif tt_output_dtype[0] == ttl.tensor.DataType.BFLOAT16: + elif tt_input_dtype[0] == ttl.tensor.DataType.UINT16 and tt_output_dtype[0] == ttl.tensor.DataType.BFLOAT16: return x.to(torch.bfloat16) else: return x diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 
2740045969a..c212ce00dd7 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -2331,15 +2331,15 @@ def eltwise_typecast( x, *args, device, - dtype, + tt_input_dtype, tt_output_dtype, layout, input_mem_config, output_mem_config, **kwargs, ): - t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) - t1 = ttl.tensor.eltwise_typecast(t0, tt_output_dtype[0], output_mem_config=output_mem_config) + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], tt_input_dtype[0]) + t1 = ttl.tensor.eltwise_typecast(t0, tt_input_dtype[0], tt_output_dtype[0], output_mem_config=output_mem_config) return tt2torch_tensor(t1) diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index f327e839966..f72b98b6977 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -191,9 +191,14 @@ std::pair get_op_init_and_func_parameterized( break; } case UnaryOpType::TYPECAST: + TT_ASSERT(params.size() == 2, "Expected eltwise_typecast to take 2 parameters"); op_init_and_name = { "typecast_tile_init();", - fmt::format("typecast_tile<{1}u>({0});", idst, std::to_string((uint32_t)datatype_to_dataformat_converter((DataType)param0)))}; + fmt::format( + "typecast_tile<{1}u, {2}u>({0});", + idst, + std::to_string((uint32_t)datatype_to_dataformat_converter((DataType)params[0])), + std::to_string((uint32_t)datatype_to_dataformat_converter((DataType)params[1])))}; break; default: TT_ASSERT(false && "unexpected parameterized type"); }; diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index ee81024cf74..c9f79be5a80 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ 
b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -201,7 +201,7 @@ inline Tensor run_eltwise_unary( const std::vector& ops_chain, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { TT_FATAL(ops_chain.size() > 0, "At least 1 unary op must be specified"); - DataType output_dtype = (ops_chain[0].op_type == UnaryOpType::TYPECAST) ? static_cast(ops_chain[0].params[0]) : input_tensor.get_dtype(); + DataType output_dtype = (ops_chain[0].op_type == UnaryOpType::TYPECAST) ? static_cast(ops_chain[0].params[1]) : input_tensor.get_dtype(); bool fp32_dest_acc_en = output_dtype == DataType::UINT32 or input_tensor.get_dtype() == DataType::UINT32 or @@ -247,7 +247,7 @@ inline Tensor run_eltwise_unary( const std::vector& ops_chain, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { TT_FATAL(ops_chain.size() > 0, "At least 1 unary op must be specified"); - DataType output_dtype = (ops_chain[0].op_type == UnaryOpType::TYPECAST) ? static_cast(ops_chain[0].params[0]) : input_tensor.get_dtype(); + DataType output_dtype = (ops_chain[0].op_type == UnaryOpType::TYPECAST) ? 
static_cast(ops_chain[0].params[1]) : input_tensor.get_dtype(); bool fp32_dest_acc_en = output_dtype == DataType::UINT32 or input_tensor.get_dtype() == DataType::UINT32 or @@ -378,7 +378,6 @@ constexpr auto rsub = make_eltwise_unary_with_param{}; constexpr auto silu = make_eltwise_unary{}; constexpr auto identity = make_eltwise_unary{}; constexpr auto identity_uint32 = make_eltwise_unary{}; -constexpr auto eltwise_typecast = make_eltwise_unary_with_param{}; constexpr auto add_unary_sfpu = make_eltwise_symmetric_binop_unary_with_param{}; constexpr auto mul_unary_sfpu = make_eltwise_symmetric_binop_unary_with_param{}; constexpr auto unary_gt = make_eltwise_unary_with_param{}; @@ -452,6 +451,18 @@ inline Tensor softplus( input_tensor, {UnaryWithParam(UnaryOpType::SOFTPLUS, {beta, threshold})}, output_mem_config); } +inline Tensor eltwise_typecast( + const Tensor& input_tensor, + uint32_t tt_input_dtype, + uint32_t tt_output_dtype, + const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { + TT_ASSERT(input_tensor.device()->arch() != tt::ARCH::GRAYSKULL, "eltwise_typecast is not currently supported on Grayskull"); + return run_eltwise_unary( + input_tensor, + {UnaryWithParam(UnaryOpType::TYPECAST, {static_cast(tt_input_dtype), static_cast(tt_output_dtype)})}, + output_mem_config); +} + inline Tensor unary_chain( const Tensor& input_tensor, std::vector ops_chain, diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index 6cabcd4f924..789de224c59 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -110,12 +110,26 @@ namespace tt::tt_metal::detail { detail::bind_unary_op(m_tensor, "silu", silu, R"doc(Returns tensor with the silu all of elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "neg", neg, R"doc(Returns tensor with the negate all of elements of the input 
tensor ``{0}``.)doc"); - detail::bind_unary_op_with_param( - m_tensor, "eltwise_typecast", eltwise_typecast, - py::arg("tt_output_dtype"), - R"doc(Returns tensor with all of the elements of the input tensor ``{0}`` typecasted from bfloat16 to uint32, bfloat16 to uint16, or uint16 to bfloat16.)doc", - R"doc("Indicates output dtype of typecast", "ttl.tensor.DataType", "")doc" - ); + m_tensor.def("eltwise_typecast", &eltwise_typecast, + py::arg("input").noconvert(), py::arg("tt_input_dtype"), py::arg("tt_output_dtype"), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Returns tensor with all elements of the input tensor ``{0}`` typecasted. + Supported typecasts: + BFLOAT16 -> UINT32 + BFLOAT16 -> UINT16 + UINT16 -> BFLOAT16 + + Input tensor must have tt_input_dtype data type. + + Output tensor will have tt_output_dtype data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "input", "Tensor typecast is applied to", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "tt_input_dtype", "Input tensor DataType", "DataType", "One of supported input DataTypes", "Yes" + "tt_output_dtype", "Desired output tensor DataType", "DataType", "One of supported output DataTypes", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); detail::bind_unary_op_with_param( m_tensor, "exp", py::overload_cast(&exp), diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h index 56b4b0b56c5..a425d98c05a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h @@ -12,21 +12,21 @@ namespace ckernel { // New LLK
SFPU APIs -template +template inline void llk_math_eltwise_unary_sfpu_typecast(uint dst_index, int vector_mode = (int)VectorMode::RC) { - if constexpr (OUT_DTYPE == (uint32_t)DataFormat::UInt16) { + if constexpr (IN_DTYPE == (uint32_t)DataFormat::Float16_b && OUT_DTYPE == (uint32_t)DataFormat::UInt16) { llk_math_eltwise_unary_sfpu_params( ckernel::sfpu::calculate_typecast_fp16b_to_uint16, dst_index, vector_mode); } - else if constexpr (OUT_DTYPE == (uint32_t)DataFormat::UInt32) { + else if constexpr (IN_DTYPE == (uint32_t)DataFormat::Float16_b && OUT_DTYPE == (uint32_t)DataFormat::UInt32) { llk_math_eltwise_unary_sfpu_params( ckernel::sfpu::calculate_typecast_fp16b_to_uint32, dst_index, vector_mode); } - else if constexpr (OUT_DTYPE == (uint32_t)DataFormat::Float16_b) { + else if constexpr (IN_DTYPE == (uint32_t)DataFormat::UInt16 && OUT_DTYPE == (uint32_t)DataFormat::Float16_b) { llk_math_eltwise_unary_sfpu_params( ckernel::sfpu::calculate_typecast_uint16_to_fp16b, dst_index, diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h index 69e87503ff5..7c06201c25b 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h @@ -31,11 +31,12 @@ namespace ckernel { * | Argument | Description | Type | Valid Range | Required | * |----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| * | tile_index | The index of the tile in DST register buffer to perform typecast operation | uint32_t | Must be less than the size of the DST register buffer | True | + * | IN_DTYPE | Input data format | uint32_t | Must be valid tt::DataFormat | True | * | OUT_DTYPE | Desired output data format | uint32_t | Must be valid tt::DataFormat | True | */ -template +template ALWI void typecast_tile(uint32_t idst) { - MATH(( 
llk_math_eltwise_unary_sfpu_typecast(idst) )); + MATH(( llk_math_eltwise_unary_sfpu_typecast(idst) )); } /** diff --git a/ttnn/cpp/ttnn/operations/unary.hpp b/ttnn/cpp/ttnn/operations/unary.hpp index 496561d0a09..bae3685e09e 100644 --- a/ttnn/cpp/ttnn/operations/unary.hpp +++ b/ttnn/cpp/ttnn/operations/unary.hpp @@ -44,7 +44,7 @@ inline Tensor execute_on_worker_thread( const std::vector& op_chain, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt) { - DataType output_dtype = (op_chain[0].op_type == UnaryOpType::TYPECAST) ? static_cast(op_chain[0].params[0]) : input_tensor.get_dtype(); + DataType output_dtype = (op_chain[0].op_type == UnaryOpType::TYPECAST) ? static_cast(op_chain[0].params[1]) : input_tensor.get_dtype(); bool fp32_dest_acc_en = output_dtype == DataType::UINT32 or input_tensor.get_dtype() == DataType::UINT32 or input_tensor.get_dtype() == DataType::INT32; // MT: Currently only uint32/int32 is moved to From 4e741a4451e905617b80c31947fe89eb9b49a224 Mon Sep 17 00:00:00 2001 From: Radomir Djogo Date: Mon, 10 Jun 2024 21:30:31 +0000 Subject: [PATCH 2/5] #4858: add int32 to fp16b typecast --- .../sweep_tests/pytorch_ops.py | 2 ++ .../csrc/tt_lib_bindings_tensor_xary_ops.cpp | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_typecast.h | 16 +++++++++++++++- .../llk_math_eltwise_unary_sfpu_typecast.h | 6 ++++++ .../compute_kernel_api/eltwise_unary/typecast.h | 2 ++ 5 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index dbe87a4a6a9..cd51cac3160 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -1381,6 +1381,8 @@ def eltwise_typecast(x, *args, tt_input_dtype, tt_output_dtype, **kwargs): return torch.relu(x.to(torch.int32)) # due to no uint32 support elif tt_input_dtype[0] == 
ttl.tensor.DataType.UINT16 and tt_output_dtype[0] == ttl.tensor.DataType.BFLOAT16: return x.to(torch.bfloat16) + elif tt_input_dtype[0] == ttl.tensor.DataType.INT32 and tt_output_dtype[0] == ttl.tensor.DataType.BFLOAT16: + return x.to(torch.bfloat16) else: return x diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index 789de224c59..ca8007d24de 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -117,6 +117,7 @@ namespace tt::tt_metal::detail { BFLOAT16 -> UINT32 BFLOAT16 -> UINT16 UINT16 -> BFLOAT16 + INT32 -> BFLOAT16 Input tensor must have tt_input_dtype data type. diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_typecast.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_typecast.h index a41af67a272..94c758e7918 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_typecast.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_typecast.h @@ -74,7 +74,21 @@ inline void calculate_typecast_uint16_to_fp16b() for (int d = 0; d < ITERATIONS; d++) { TTI_SFPLOAD(0,6,3,0); TTI_SFPCAST(0,1,0); - TTI_SFPSTORE(1,2,3,0); + TTI_SFP_STOCH_RND(0,0,3,1,2,1); + TTI_SFPSTORE(2,2,3,0); + dst_reg++; + } +} + +template +inline void calculate_typecast_int32_to_fp16b() +{ + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) { + TTI_SFPLOAD(0,12,3,0); + TTI_SFPCAST(0,1,0); + TTI_SFP_STOCH_RND(0,0,3,1,2,1); + TTI_SFPSTORE(2,2,3,0); dst_reg++; } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h index a425d98c05a..ef18e2f0676 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_typecast.h @@ -32,6 +32,12 @@ inline void llk_math_eltwise_unary_sfpu_typecast(uint dst_index, int vector_mode dst_index, vector_mode); } + else if constexpr (IN_DTYPE == (uint32_t)DataFormat::Int32 && OUT_DTYPE == (uint32_t)DataFormat::Float16_b) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_typecast_int32_to_fp16b, + dst_index, + vector_mode); + } } template diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h index 7c06201c25b..6d19edba393 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h @@ -24,6 +24,8 @@ namespace ckernel { * Float16_b -> UInt32 * Float16_b -> UInt16 * UInt16 -> Float16_b + * Int32 -> Float16_b + * * For output to be UInt32, Dest must be in 32 bit mode. * * Return value: None From be8579d6712c992f11156d6259e55e821f976bc7 Mon Sep 17 00:00:00 2001 From: Radomir Djogo Date: Mon, 10 Jun 2024 23:37:22 +0000 Subject: [PATCH 3/5] #4858: update typecast defines for eltwise binary case --- .../op_library/eltwise_binary/eltwise_binary_op.cpp | 13 +++++++++---- .../op_library/eltwise_binary/eltwise_binary_op.hpp | 4 +++- .../multi_core/eltwise_binary_op_multi_core.cpp | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp index 426d17e4fda..3bc52d11ab5 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.cpp @@ -19,7 +19,11 @@ namespace eltwise_binary_op_utils { using namespace tt::tt_metal; std::map get_defines( - BinaryOpType op_type, const std::optional output_dtype, const std::optional> fused_activations) { + BinaryOpType op_type, + const 
std::optional in_dtype, + const std::optional output_dtype, + const std::optional> fused_activations) { + std::map defines; string op_name = "sub_tiles"; string op_binary_type = "EltwiseBinaryType::ELWSUB"; @@ -106,12 +110,13 @@ std::map get_defines( default: TT_ASSERT(false && "Undefined op type"); } - if(output_dtype.has_value() && (output_dtype.value() == DataType::UINT32 || output_dtype.value() == DataType::UINT16)){ + if(in_dtype.has_value() && output_dtype.has_value() && (output_dtype.value() == DataType::UINT32 || output_dtype.value() == DataType::UINT16)){ TT_ASSERT(defines.count("SFPU_OP_CHAIN_0") == 0 && "SFPU_OP_CHAIN_0 already defined"); - auto dataformat = std::to_string((uint32_t)datatype_to_dataformat_converter(output_dtype.value())); + auto in_dataformat = std::to_string((uint32_t)datatype_to_dataformat_converter(in_dtype.value())); + auto out_dataformat = std::to_string((uint32_t)datatype_to_dataformat_converter(output_dtype.value())); defines.insert({"SFPU_OP_CHAIN_0", - fmt::format("typecast_tile_init(); typecast_tile<{0}u>(i);", dataformat)}); + fmt::format("typecast_tile_init(); typecast_tile<{0}u, {1}u>(i);", in_dataformat, out_dataformat)}); defines.insert({"SFPU_OP_TYPECAST_INCLUDE", "1"}); } diff --git a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp index 6cf3624eec7..3b9a40bbed6 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/eltwise_binary_op.hpp @@ -41,7 +41,9 @@ enum class BinaryOpType { namespace eltwise_binary_op_utils { -std::map get_defines(BinaryOpType op_type, const std::optional out_dtype = std::nullopt, +std::map get_defines(BinaryOpType op_type, + const std::optional in_dtype = std::nullopt, + const std::optional out_dtype = std::nullopt, const std::optional> fused_activations = std::nullopt); } // namespace eltwise_binary_op_utils diff --git 
a/tt_eager/tt_dnn/op_library/eltwise_binary/multi_core/eltwise_binary_op_multi_core.cpp b/tt_eager/tt_dnn/op_library/eltwise_binary/multi_core/eltwise_binary_op_multi_core.cpp index f9bf11ef33e..dbe81b9f97a 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_binary/multi_core/eltwise_binary_op_multi_core.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_binary/multi_core/eltwise_binary_op_multi_core.cpp @@ -312,7 +312,7 @@ operation::ProgramWithCallbacks eltwise_binary_multi_core(const Tensor &a, const } auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src1_config); - std::map eltwise_defines = eltwise_binary_op_utils::get_defines(op_type, output.get_dtype(), fused_activations); + std::map eltwise_defines = eltwise_binary_op_utils::get_defines(op_type, a.get_dtype(), output.get_dtype(), fused_activations); if (eltwise_defines.find("SFPU_OP_INIT_PRE_IN0_0") != eltwise_defines.end()) { tt_metal::CircularBufferConfig cb_interm_config = tt_metal::CircularBufferConfig(1 * src0_single_tile_size, {{CB::c_intermed0, src0_cb_data_format}}) From c3e46b2eff664b7dc9a4cc4cae1a9d6caaf1adb8 Mon Sep 17 00:00:00 2001 From: Radomir Djogo Date: Tue, 11 Jun 2024 01:08:47 +0000 Subject: [PATCH 4/5] #4858: update ttnn typecast to provide 2 params --- ttnn/cpp/ttnn/operations/copy.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/copy.hpp b/ttnn/cpp/ttnn/operations/copy.hpp index d324a1d7834..60cb33419ab 100644 --- a/ttnn/cpp/ttnn/operations/copy.hpp +++ b/ttnn/cpp/ttnn/operations/copy.hpp @@ -46,9 +46,10 @@ struct Typecast { TT_FATAL(output_dtype == optional_output_tensor.value().get_dtype(), "If both output dtype and output tensor provided dtype should match"); } + DataType input_dtype = input.get_dtype(); auto memory_config = memory_config_arg.value_or(input.memory_config()); - bool fp32_dest_acc_en = output_dtype == DataType::UINT32; - auto unary_op = UnaryWithParam{UnaryOpType::TYPECAST, static_cast(output_dtype)}; 
+ bool fp32_dest_acc_en = output_dtype == DataType::UINT32 or input_dtype == DataType::INT32; + auto unary_op = UnaryWithParam{UnaryOpType::TYPECAST, {static_cast(input_dtype), static_cast(output_dtype)}}; auto eltwise_op = EltwiseUnary{{unary_op}, memory_config, fp32_dest_acc_en, output_dtype}; return operation::run(eltwise_op, {input}, {}, {optional_output_tensor}, queue_id).at(0); } From 21ee2b4b73e8585f2d904bec36a4c3f33ec0c5e0 Mon Sep 17 00:00:00 2001 From: Radomir Djogo Date: Tue, 11 Jun 2024 15:14:52 +0000 Subject: [PATCH 5/5] #4858: document that GS does not support typecast --- tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp | 2 ++ tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index ca8007d24de..26085c9c09c 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -123,6 +123,8 @@ namespace tt::tt_metal::detail { Output tensor will have tt_output_dtype data type. + Note: This operation is not supported on Grayskull. + .. csv-table:: :header: "Argument", "Description", "Data type", "Valid range", "Required" diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h index 6d19edba393..4b58e8e6787 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/typecast.h @@ -30,6 +30,8 @@ namespace ckernel { * * Return value: None * + * Note: This operation is not supported on Grayskull. 
+ * * | Argument | Description | Type | Valid Range | Required | * |----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| * | tile_index | The index of the tile in DST register buffer to perform typecast operation | uint32_t | Must be less than the size of the DST register buffer | True |