Skip to content

Commit

Permalink
#9744: add use_multicore
Browse files Browse the repository at this point in the history
  • Loading branch information
ntarafdar committed Jun 28, 2024
1 parent c5fc70d commit 16c7d5b
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 41 deletions.
8 changes: 1 addition & 7 deletions tt_eager/tt_dnn/op_library/backward/backward_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1968,17 +1968,11 @@ std::vector<Tensor> _prod_bw(
// dim 0
Tensor tensor_1_temp = reciprocal_input;
if (reciprocal_input.get_legacy_shape()[0] % 32 != 0) {
const Shape start_index = {0, 0, 0, 0};
const Shape required_shape = {
reciprocal_input.get_legacy_shape()[0] + (32 - (reciprocal_input.get_legacy_shape()[0] % 32)),
reciprocal_input.get_legacy_shape()[1],
reciprocal_input.get_legacy_shape()[2],
reciprocal_input.get_legacy_shape()[3]};
std::vector<std::pair<uint32_t, uint32_t>> padding = {{0, 0},
{0, 32 - (reciprocal_input.get_legacy_shape()[0] % 32)},
{0, 0},
{0, 0}};
tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, std::nullopt);
tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, std::nullopt);
}
std::vector<int64_t> after_permute_dims = {3, 1, 2, 0};
Tensor tensor_1 = permute(tensor_1_temp, after_permute_dims, output_mem_config);
Expand Down
13 changes: 3 additions & 10 deletions tt_eager/tt_dnn/op_library/composite/composite_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1400,17 +1400,10 @@ Tensor hypot(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& o

Tensor _scatter(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) {
    // Scatter input_a into input_b: pad input_a (and a same-shaped mask of
    // ones) up to input_b's shape starting at the origin, then take the
    // padded values where the mask is set and input_b's values elsewhere.
    //
    // NOTE(review): the original text contained duplicated/conflicting lines
    // from a diff artifact (two declarations of input_tensor_4D and two pairs
    // of ttnn::pad calls); this is the coherent post-change version using the
    // (shape, start, value) pad overload.
    const Shape start_index = {0, 0, 0, 0};
    ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_a);

    // Mask of ones marking which output elements come from input_a.
    Tensor index = ttnn::pad(
        ones_like(input_tensor_4D, output_mem_config), input_b.shape(), ttnn::Shape(start_index), 0, std::nullopt);
    // input_a's data placed at the origin of input_b's shape.
    Tensor temp_a = ttnn::pad(input_tensor_4D, input_b.shape(), ttnn::Shape(start_index), 0, std::nullopt);
    return where(index, temp_a, input_b, output_mem_config);
}
Tensor scatter(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) {
Expand Down
93 changes: 72 additions & 21 deletions ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,8 @@
#pragma once

#include "tt_eager/tensor/types.hpp"
#include "tt_eager/tt_dnn/op_library/concat/concat_op.hpp"
#include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp"
#include "tt_eager/tt_dnn/op_library/permute/permute_op.hpp"
#include "tt_eager/tt_dnn/op_library/repeat/repeat_op.hpp"
#include "tt_eager/tt_dnn/op_library/composite/composite_ops.hpp"
#include "tt_eager/tt_dnn/op_library/upsample/upsample_op.hpp"
#include "ttnn/cpp/ttnn/operations/core.hpp"
#include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp"

#include <ranges>

Expand Down Expand Up @@ -45,7 +40,68 @@ struct Pad {
static ttnn::Tensor execute_on_worker_thread(
    uint8_t queue_id,
    const ttnn::Tensor& input_tensor,
    const std::vector<uint32_t>& output_padded_shape,  // read-only: pass by const& to avoid copying
    const std::vector<uint32_t>& input_tensor_start,   // read-only: pass by const&
    const float value,
    const std::optional<MemoryConfig>& memory_config_arg) {
    // Lowest-level pad entry point: grows `input_tensor` to
    // `output_padded_shape`, placing the original data at
    // `input_tensor_start` and filling the remainder with `value`.
    //
    // queue_id:            command queue to execute on.
    // output_padded_shape: full shape of the padded output tensor.
    // input_tensor_start:  offset of the input data inside the output.
    // value:               scalar used to fill the padded region.
    // memory_config_arg:   output memory config; defaults to the input's.
    //
    // Returns the padded tensor produced by the tt_metal Pad op
    // (multicore dispatch enabled — see use_multicore below).
    auto memory_config = memory_config_arg.value_or(input_tensor.memory_config());

    auto output_tensor = operation::run(
        tt::tt_metal::Pad{
            .output_tensor_shape = tt::tt_metal::Shape(output_padded_shape),
            .input_tensor_start = tt::tt_metal::Shape(input_tensor_start),
            .pad_value = value,
            .output_mem_config = memory_config,
            .use_multicore = true  // new in this change (#9744)
        },
        {input_tensor}).front();

    return output_tensor;
}

static ttnn::Tensor execute_on_worker_thread(
    uint8_t queue_id,
    const ttnn::Tensor& input_tensor,
    const Shape output_padded_shape,
    const Shape input_tensor_start,
    const float value,
    const std::optional<MemoryConfig>& memory_config_arg) {
    // Shape-based convenience overload: flatten both Shape arguments into
    // plain uint32_t vectors and forward to the vector-based overload.
    const uint32_t rank = output_padded_shape.rank();

    std::vector<uint32_t> padded_shape_vec;
    std::vector<uint32_t> start_vec;
    padded_shape_vec.reserve(rank);
    start_vec.reserve(rank);

    for (uint32_t i = 0; i < rank; ++i) {
        padded_shape_vec.push_back(output_padded_shape[i]);
        start_vec.push_back(input_tensor_start[i]);
    }

    return execute_on_worker_thread(queue_id, input_tensor, padded_shape_vec, start_vec, value, memory_config_arg);
}


static ttnn::Tensor execute_on_worker_thread(
    const ttnn::Tensor& input_tensor,
    const Shape output_padded_shape,
    const Shape input_tensor_start,
    const float value,
    const std::optional<MemoryConfig>& memory_config_arg) {
    // Queue-less convenience overload: identical to the queue-aware Shape
    // overload, executed on the default command queue.
    return execute_on_worker_thread(
        DefaultQueueId, input_tensor, output_padded_shape, input_tensor_start, value, memory_config_arg);
}



static ttnn::Tensor execute_on_worker_thread(
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
std::vector<std::pair<uint32_t, uint32_t>> padding,
const float value,
const std::optional<MemoryConfig>& memory_config_arg) {

Expand All @@ -64,17 +120,14 @@ struct Pad {
"ttnn.pad: row-major tensors have to use fallback because the kernel currently causes a PCC error");

// Unsqueeze Tensor to 4D if it is not already
ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_tensor);
ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_tensor);
padding.insert(padding.begin(), 4 - original_rank, {0, 0});
auto input_shape_with_tile_padding = input_tensor_4D.get_shape().with_tile_padding();
std::vector<uint32_t> output_padded_shape(padding.size());
for(size_t i = 0; i < padding.size(); i++) {
output_padded_shape[i] = input_shape_with_tile_padding[i] + padding[i].second;
}

// Due to the strangeness of tt::tt_metal::pad, we need to split front and back pad
// Front will be passed separately. And pad_back is retrieved -> output_padded_shape - pad_front
auto memory_config = memory_config_arg.value_or(input_tensor.memory_config());
auto pad_front = padding | std::views::transform([](const auto& p) { return p.first; });
auto pad_back = padding | std::views::transform([](const auto& p) { return p.second; });

Expand All @@ -90,17 +143,11 @@ struct Pad {
"ttnn.pad: for tiled tensors padding end must be a multiple of the tile size on height and width for a "
"tensor in tile layout");

// Performing actual padding
// Performing actual padding
std::vector<uint32_t> pad_front_vec(pad_front.begin(), pad_front.end());
auto output_tensor = operation::run(
tt::tt_metal::Pad{
.output_tensor_shape=tt::tt_metal::Shape(output_padded_shape),
.input_tensor_start=tt::tt_metal::Shape(pad_front_vec),
.pad_value=value,
.output_mem_config=memory_config,
.use_multicore=true
},
{input_tensor_4D}).front();


auto output_tensor = execute_on_worker_thread(queue_id, input_tensor, output_padded_shape, pad_front_vec, value, memory_config_arg);


// output_tensor is currently 4D. We have to squeeze back to the original rank
Expand All @@ -121,8 +168,10 @@ struct Pad {
output_tensor = ttnn::reshape(output_tensor, ttnn::Shape(padded_shape));

return output_tensor;

}


static ttnn::Tensor execute_on_worker_thread(
const ttnn::Tensor& input_tensor,
std::vector<std::pair<uint32_t, uint32_t>> padding, //intentionally not const&
Expand All @@ -132,6 +181,8 @@ struct Pad {
return execute_on_worker_thread(DefaultQueueId, input_tensor, padding, value, memory_config_arg);

}


};

} // namespace data_movement
Expand Down
9 changes: 6 additions & 3 deletions ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ Pad tensor with constant value. Padded shape is accumulated if ttnn.pad is calle
* :attr:`input_tensor`: input tensor
* :attr:`padding`: padding to apply. Each element of padding should be a tuple of 2 integers, with the first integer specifying the number of values to add before the tensor and the second integer specifying the number of values to add after the tensor.
* :attr:`value`: value to pad with
* :attr:`queue_id` (Optional[uint8]): command queue id
Keyword Args:
* :attr:`memory_config`: the memory configuration to use for the operation)doc";
* :attr:`memory_config`: the memory configuration to use for the operation
* :attr:`queue_id` (Optional[uint8]): command queue id
* :attr:`use_multicore` (Optional[bool]): whether or not we should use multicore. Defaults to true
)doc";

using OperationType = decltype(ttnn::pad);
ttnn::bind_registered_operation(
Expand All @@ -47,7 +49,8 @@ Keyword Args:
py::arg("value"),
py::kw_only(),
py::arg("memory_config") = std::nullopt,
py::arg("queue_id") = 0});
py::arg("queue_id") = 0,
});
}

} // namespace ttnn::operations::data_movement::detail

0 comments on commit 16c7d5b

Please sign in to comment.