From 16c7d5b3dd1f5d2070cd16b2def26b58f04a45f3 Mon Sep 17 00:00:00 2001 From: tarafdarTT Date: Fri, 28 Jun 2024 16:28:25 +0000 Subject: [PATCH] #9744: add use_multicore --- .../op_library/backward/backward_ops.cpp | 8 +- .../op_library/composite/composite_ops.cpp | 13 +-- .../ttnn/operations/data_movement/pad/pad.hpp | 93 ++++++++++++++----- .../data_movement/pad/pad_pybind.hpp | 9 +- 4 files changed, 82 insertions(+), 41 deletions(-) diff --git a/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp b/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp index 95a1616b5363..0c9400a805fc 100644 --- a/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp +++ b/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp @@ -1968,17 +1968,11 @@ std::vector _prod_bw( // dim 0 Tensor tensor_1_temp = reciprocal_input; if (reciprocal_input.get_legacy_shape()[0] % 32 != 0) { - const Shape start_index = {0, 0, 0, 0}; - const Shape required_shape = { - reciprocal_input.get_legacy_shape()[0] + (32 - (reciprocal_input.get_legacy_shape()[0] % 32)), - reciprocal_input.get_legacy_shape()[1], - reciprocal_input.get_legacy_shape()[2], - reciprocal_input.get_legacy_shape()[3]}; std::vector> padding = {{0, 0}, {0, 32 - (reciprocal_input.get_legacy_shape()[0] % 32)}, {0, 0}, {0, 0}}; - tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, std::nullopt); + tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, std::nullopt); } std::vector after_permute_dims = {3, 1, 2, 0}; Tensor tensor_1 = permute(tensor_1_temp, after_permute_dims, output_mem_config); diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index f541d3104d4c..bab071847196 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -1400,17 +1400,10 @@ Tensor hypot(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& o Tensor _scatter(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) { const Shape start_index = {0, 0, 0, 0}; - ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_a); - auto input_shape_with_tile_padding = input_tensor_4D.get_shape().with_tile_padding(); - auto output_padded_shape = input_b.legacy_shape(); - std::vector> padding(4); - for(size_t i = 0; i < padding.size(); i++) { - padding[i] = {0, output_padded_shape[i] - input_shape_with_tile_padding[i]}; - } - + ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_a); - Tensor index = ttnn::pad(ones_like(input_a, output_mem_config), padding, 0, std::nullopt); - Tensor temp_a = ttnn::pad(input_a, padding, 0, std::nullopt); + Tensor index = ttnn::pad(ones_like(input_tensor_4D, output_mem_config), input_b.shape(), ttnn::Shape(start_index), 0, std::nullopt); + Tensor temp_a = ttnn::pad(input_tensor_4D,input_b.shape(), ttnn::Shape(start_index), 0, std::nullopt); return where(index, temp_a, input_b, output_mem_config); } Tensor scatter(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp index 3037853ed48c..1015cbb7cc41 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp @@ -5,13 +5,8 @@ #pragma once #include "tt_eager/tensor/types.hpp" -#include "tt_eager/tt_dnn/op_library/concat/concat_op.hpp" -#include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp" -#include "tt_eager/tt_dnn/op_library/permute/permute_op.hpp" -#include "tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp" -#include "tt_eager/tt_dnn/op_library/composite/composite_ops.hpp" -#include "tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp" #include "ttnn/cpp/ttnn/operations/core.hpp" +#include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp" #include @@ -45,7 +40,68 @@ struct Pad { static ttnn::Tensor execute_on_worker_thread( uint8_t queue_id, const ttnn::Tensor& input_tensor, - std::vector> padding, //intentionally not const& + std::vector output_padded_shape, + std::vector input_tensor_start, + const float value, + const std::optional& memory_config_arg) { + + + auto memory_config = memory_config_arg.value_or(input_tensor.memory_config()); + + auto output_tensor = operation::run( + tt::tt_metal::Pad{ + .output_tensor_shape=tt::tt_metal::Shape(output_padded_shape), + .input_tensor_start=tt::tt_metal::Shape(input_tensor_start), + .pad_value=value, + .output_mem_config=memory_config, + .use_multicore=true + }, + {input_tensor}).front(); + + return output_tensor; + + } + + static ttnn::Tensor execute_on_worker_thread( + uint8_t queue_id, + const ttnn::Tensor& input_tensor, + const Shape output_padded_shape, + const Shape input_tensor_start, + const float value, + const std::optional& memory_config_arg) { + + + std::vector output_padded_vector(output_padded_shape.rank()); + std::vector input_start_vector(output_padded_shape.rank()); + + for(uint32_t dim=0; dim& memory_config_arg) { + + + + return execute_on_worker_thread(DefaultQueueId, input_tensor, output_padded_shape, input_tensor_start, value, memory_config_arg); + + } + + + + static ttnn::Tensor execute_on_worker_thread( + uint8_t queue_id, + const ttnn::Tensor& input_tensor, + std::vector> padding, const float value, const std::optional& memory_config_arg) { @@ -64,7 +120,7 @@ struct Pad { "ttnn.pad: row-major tensors have to use fallback because the kernel currently causes a PCC error"); // Unsqueeze Tensor to 4D if it is not already - ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_tensor); + ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_tensor); padding.insert(padding.begin(), 4 - original_rank, {0, 0}); auto input_shape_with_tile_padding = input_tensor_4D.get_shape().with_tile_padding(); std::vector output_padded_shape(padding.size()); @@ -72,9 +128,6 @@ struct Pad { output_padded_shape[i] = input_shape_with_tile_padding[i] + padding[i].second; } - // Due to the strangeness of tt::tt_metal::pad, we need to split front and back pad - // Front will be passed separately. And pad_back is retrieved -> output_padded_shape - pad_front - auto memory_config = memory_config_arg.value_or(input_tensor.memory_config()); auto pad_front = padding | std::views::transform([](const auto& p) { return p.first; }); auto pad_back = padding | std::views::transform([](const auto& p) { return p.second; }); @@ -90,17 +143,11 @@ struct Pad { "ttnn.pad: for tiled tensors padding end must be a multiple of the tile size on height and width for a " "tensor in tile layout"); - // Performing actual padding + // Performing actual padding std::vector pad_front_vec(pad_front.begin(), pad_front.end()); - auto output_tensor = operation::run( - tt::tt_metal::Pad{ - .output_tensor_shape=tt::tt_metal::Shape(output_padded_shape), - .input_tensor_start=tt::tt_metal::Shape(pad_front_vec), - .pad_value=value, - .output_mem_config=memory_config, - .use_multicore=true - }, - {input_tensor_4D}).front(); + + + auto output_tensor = execute_on_worker_thread(queue_id, input_tensor, output_padded_shape, pad_front_vec, value, memory_config_arg); // output_tensor is currently 4D. We have to squeeze back to the original rank @@ -121,8 +168,10 @@ struct Pad { output_tensor = ttnn::reshape(output_tensor, ttnn::Shape(padded_shape)); return output_tensor; + } + static ttnn::Tensor execute_on_worker_thread( const ttnn::Tensor& input_tensor, std::vector> padding, //intentionally not const& @@ -132,6 +181,8 @@ struct Pad { return execute_on_worker_thread(DefaultQueueId, input_tensor, padding, value, memory_config_arg); } + + }; } // namespace data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp index 37b3a9bc7fad..9cbf84cd09ed 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp @@ -23,10 +23,12 @@ Pad tensor with constant value. Padded shape is accumulated if ttnn.pad is calle * :attr:`input_tensor`: input tensor * :attr:`padding`: padding to apply. Each element of padding should be a tuple of 2 integers, with the first integer specifying the number of values to add before the tensor and the second integer specifying the number of values to add after the tensor. * :attr:`value`: value to pad with - * :attr:`queue_id` (Optional[uint8]): command queue id Keyword Args: - * :attr:`memory_config`: the memory configuration to use for the operation)doc"; + * :attr:`memory_config`: the memory configuration to use for the operation + * :attr:`queue_id` (Optional[uint8]): command queue id + * :attr:`use_multicore` (Optional[bool]): whether or not we should use multicore. Defaults to true + )doc"; using OperationType = decltype(ttnn::pad); ttnn::bind_registered_operation( @@ -47,7 +49,8 @@ Keyword Args: py::arg("value"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = 0, + }); } } // namespace ttnn::operations::data_movement::detail