
#8835: Introduce a faster more flexible operation infra v2 #9279

Merged: 5 commits, Jun 18, 2024
16 changes: 8 additions & 8 deletions tests/ttnn/unit_tests/operations/test_relational.py
@@ -236,16 +236,16 @@ def test_expand_and_broadcast(device, h, w):
@pytest.mark.parametrize("h", [500])
@pytest.mark.parametrize("w", [512])
def test_expand_and_broadcast_reversed(device, h, w):
-    torch_a = torch.rand((1, h, w), dtype=torch.bfloat16)
-    torch_b = torch.rand((h, w), dtype=torch.bfloat16)
-    torch_output = torch.lt(torch_b, torch_a)
+    torch_input_tensor_a = torch.rand((1, h, w), dtype=torch.bfloat16)
+    torch_input_tensor_b = torch.rand((h, w), dtype=torch.bfloat16)
+    torch_output = torch.lt(torch_input_tensor_b, torch_input_tensor_a)

-    a = ttnn.from_torch(torch_a, layout=ttnn.TILE_LAYOUT, device=device)
-    b = ttnn.from_torch(torch_b, layout=ttnn.TILE_LAYOUT, device=device)
-    tt_output = ttnn.lt(b, a)
-    tt_output = ttnn.to_torch(tt_output)
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    output = ttnn.lt(input_tensor_b, input_tensor_a)
+    output = ttnn.to_torch(output)

-    assert_with_pcc(torch_output, tt_output, 0.9999)
+    assert_with_pcc(torch_output, output, 0.9999)


@pytest.mark.parametrize("atol", [1e-8, 1e-10])
12 changes: 11 additions & 1 deletion tt_eager/tensor/tensor.hpp
@@ -43,7 +43,8 @@ struct Tensor {
bool track_ref_count = false;
TensorAttributes(const Storage storage, const ttnn::Shape shape, DataType dtype, Layout layout) :
storage(storage), shape(shape), dtype(dtype), layout(layout) {}
-        TensorAttributes() : shape({0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
+        TensorAttributes() :
+            shape(std::array<uint32_t, 4>{0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
~TensorAttributes() = default;

// Use these functions to manage the main_thread_ref_count for a tensor attr instance.
@@ -392,6 +393,15 @@ Tensor create_device_tensor(
Device *device,
const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED});

+static Tensor create_device_tensor(
+    const ttnn::Shape &shape,
+    DataType dtype,
+    Layout layout,
+    Device *device,
+    const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
+    return create_device_tensor(shape.value(), dtype, layout, device, memory_config);
+}

// template<typename Buffer>
// void *get_host_buffer(const Tensor &tensor);
void *get_raw_host_data_ptr(const Tensor &tensor);
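For context, a hypothetical call site for the new overload (fragment only; the enum values and the Device pointer are illustrative, not taken from this PR): callers holding a ttnn::Shape no longer unwrap it with .value() themselves.

    // Illustrative fragment, not part of the diff: assumes a valid Device*
    // and that the caller wants a tile-aligned 4D shape.
    auto tensor = create_device_tensor(
        ttnn::Shape(std::array<uint32_t, 4>{1, 1, 32, 32}),
        DataType::BFLOAT16,
        Layout::TILE,
        device);  // memory_config defaults to interleaved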
22 changes: 15 additions & 7 deletions tt_eager/tensor/types.hpp
@@ -173,19 +173,21 @@ class Shape {
}

template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
rank_(Rank), dimensions_{}, padding_{Rank} {
for (auto index = 0; index < Rank; index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
}
-    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_tile_padding) :
+    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) :
rank_(shape.size()), dimensions_{}, padding_{shape.size()} {
-        TT_ASSERT(shape.size() == shape_tile_padding.size(), "Shape and shape_tile_padding must have the same size");
+        TT_ASSERT(
+            shape.size() == shape_with_tile_padding.size(),
+            "Shape and shape_with_tile_padding must have the same size");
for (auto index = 0; index < shape.size(); index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
@@ -720,14 +722,20 @@ struct Shape {
explicit Shape(const std::array<uint32_t, Rank> &shape) : ranked_shape{RankedShape<Rank>{shape}} {}

template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
-        ranked_shape{RankedShape<Rank>{shape, shape_tile_padding}} {}
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
+        ranked_shape{RankedShape<Rank>{shape, shape_with_tile_padding}} {}

template <std::size_t Rank>
explicit Shape(
const std::array<uint32_t, Rank> &shape, const std::array<std::array<uint32_t, 2>, Rank> &tile_padding) :
ranked_shape{RankedShape<Rank>{shape, tile_padding}} {}

+    static Shape from_vector(const std::vector<uint32_t> &shape) { return Shape{tt::tt_metal::Shape{shape}}; }
+
+    static Shape from_vector(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) {
+        return Shape{tt::tt_metal::Shape{shape, shape_with_tile_padding}};
+    }

const auto rank() const {
return std::visit(
[]<std::size_t Rank>(const RankedShape<Rank> &shape) -> const auto { return Rank; }, this->ranked_shape);
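The two from_vector overloads mirror the array-based constructors: the one-argument form builds an unpadded shape, and the two-argument form records per-dimension back-padding as padded_dimension minus logical_dimension. An illustrative fragment (values assumed, not from this PR):

    // Logical 1x1x30x30 carried inside a tile-aligned 1x1x32x32 padded shape;
    // the back padding for the last two dims becomes 2.
    auto padded = ttnn::Shape::from_vector({1, 1, 30, 30}, {1, 1, 32, 32});
    auto unpadded = ttnn::Shape::from_vector({1, 1, 30, 30});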
8 changes: 6 additions & 2 deletions tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -17,6 +17,10 @@
#include "tt_metal/tt_stl/reflection.hpp"
#include "tt_numpy/functions.hpp"

+namespace tt::tt_metal {
+std::atomic<uint32_t> operation_id_atomic_count = 0;
+}

namespace tt::tt_metal::operation {

namespace detail {
@@ -119,7 +123,7 @@ constexpr auto decorate_device_operation(const Function& function) {
template <typename OutputTensors>
OutputTensors run_host_operation(const HostOperation<OutputTensors>& operation, const Tensors& input_tensors) {
ZoneScopedN("TT_DNN_HOST_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();

operation.validate(input_tensors);
auto output_tensors = operation.compute_output_tensors(input_tensors);
@@ -143,7 +147,7 @@ OutputTensors run_device_operation(
const OptionalConstTensors& optional_input_tensors,
const OptionalTensors& optional_output_tensors) {
ZoneScopedN("TT_DNN_DEVICE_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();

std::function<std::variant<std::shared_ptr<Program>, std::reference_wrapper<Program>>(
const DeviceOperation<OutputTensors>&,
6 changes: 0 additions & 6 deletions tt_eager/tt_dnn/op_library/run_operation.hpp
@@ -276,12 +276,6 @@ inline void log_operation(
const OptionalTensors& optional_output_tensors = {}) {}
#endif

-inline uint32_t assign_id()
-{
-    static std::atomic<uint32_t> atomic_count{0};
-    return atomic_count.fetch_add(1);
-}
-
template<class OutputTensors=Tensors>
OutputTensors run(
const HostOperation<OutputTensors>& operation,
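Taken together with the run_operation.cpp and op_profiler.hpp hunks, this deletion replaces a header-local static counter with one process-wide atomic: declared extern (plus an inline accessor) in the profiler header and defined once in run_operation.cpp, so host ops, device ops, and Python fallbacks all draw ids from the same pool. A condensed sketch of the pattern (single-file toy; the real code splits it across the two files named above):

    #include <atomic>
    #include <cstdint>

    // op_profiler.hpp: declaration plus inline accessor, usable from any TU
    namespace tt::tt_metal {
    extern std::atomic<uint32_t> operation_id_atomic_count;
    inline uint32_t assign_operation_id() { return operation_id_atomic_count.fetch_add(1); }
    }  // namespace tt::tt_metal

    // run_operation.cpp: the single definition
    namespace tt::tt_metal {
    std::atomic<uint32_t> operation_id_atomic_count = 0;
    }  // namespace tt::tt_metal

    int main() {
        // ids increase monotonically across every call site in the process
        return static_cast<int>(tt::tt_metal::assign_operation_id());  // returns 0
    }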
2 changes: 1 addition & 1 deletion tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp
@@ -567,7 +567,7 @@ Tensor convert_python_tensors_to_tt_tensors(py::list tensor_shards, std::optiona
ZoneScopedN("TT_DNN_FALLBACK_OP");
auto [op, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name);
operation::log_operation(op, input_tensors);
-    uint32_t op_id = tt::tt_metal::operation::assign_id();
+    uint32_t op_id = tt::tt_metal::assign_operation_id();

auto output_tensors = function(*args, **kwargs);

5 changes: 4 additions & 1 deletion tt_metal/impl/device/program_cache.hpp
@@ -51,7 +51,10 @@ struct ProgramCache {

private:
inline static bool is_enabled_ = false;
-    std::unordered_map<uint64_t, tt::stl::unique_any<1024, 32>> cache_{};
+
+    static constexpr auto MAX_CACHED_PROGRAM_SIZE = 1024;
+    static constexpr auto ALIGNMENT = 32;
+    std::unordered_map<uint64_t, tt::stl::unique_any<MAX_CACHED_PROGRAM_SIZE, ALIGNMENT>> cache_{};
};

}
97 changes: 97 additions & 0 deletions tt_metal/tools/profiler/op_profiler.hpp
@@ -5,6 +5,7 @@
#pragma once

#include <filesystem>
+#include <tuple>
#include <type_traits>

#include "tensor/tensor.hpp"
@@ -22,6 +23,12 @@ namespace tt {

namespace tt_metal {

+extern std::atomic<uint32_t> operation_id_atomic_count;
+
+inline uint32_t assign_operation_id() {
+    return operation_id_atomic_count.fetch_add(1);
+}

namespace op_profiler {

enum class OpType { python_fallback, tt_dnn_cpu, tt_dnn_device, unknown };
@@ -251,6 +258,48 @@ inline json get_base_json(
return j;
}

+template <typename operation_t>
+inline json get_base_json(
+    uint32_t operation_id,
+    const typename operation_t::operation_attributes_t& operation_attributes,
+    const typename operation_t::tensor_args_t& tensor_args,
+    typename operation_t::tensor_return_value_t& tensor_return_value) {
+    ZoneScoped;
+    json j;
+    j["global_call_count"] = operation_id;
+
+    auto as_string = [](std::string_view v) -> std::string { return {v.data(), v.size()}; };
+    std::string opName = as_string(tt::stl::get_type_name<operation_t>());
+    std::replace(opName.begin(), opName.end(), ',', ';');
+    j["op_code"] = opName;
+
+    json attributesObj;
+    constexpr auto& attribute_names = std::decay_t<decltype(operation_attributes)>::attribute_names;
+    const auto attribute_values = operation_attributes.attribute_values();
+    [&attributesObj, &attribute_names, &attribute_values]<size_t... Ns>(std::index_sequence<Ns...>) {
+        (
+            [&attributesObj, &attribute_names, &attribute_values] {
+                const auto& attribute_name = std::get<Ns>(attribute_names);
+                const auto& attribute = std::get<Ns>(attribute_values);
+                attributesObj[attribute_name] = fmt::format("{}", attribute);
+            }(),
+            ...);
+    }(std::make_index_sequence<std::tuple_size_v<std::decay_t<decltype(attribute_names)>>>{});
+    j["attributes"] = attributesObj;
+
+    std::vector<json> input_tensors;
+    tt::stl::reflection::visit_object_of_type<Tensor>(
+        [&input_tensors](auto&& tensor) { input_tensors.push_back(get_tensor_json(tensor)); }, tensor_args);
+    j["input_tensors"] = input_tensors;
+
+    std::vector<json> output_tensors;
+    tt::stl::reflection::visit_object_of_type<Tensor>(
+        [&output_tensors](auto&& tensor) { output_tensors.push_back(get_tensor_json(tensor)); }, tensor_return_value);
+    j["output_tensors"] = output_tensors;
+
+    return j;
+}
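The attribute loop above leans on a C++20 idiom worth unpacking: an immediately-invoked template lambda expands an index sequence so a fold expression can walk two parallel tuples at compile time. A standalone toy showing just the idiom (not the PR's code):

    #include <cstddef>
    #include <iostream>
    #include <tuple>
    #include <utility>

    int main() {
        constexpr auto names = std::make_tuple("alpha", "beta");
        const auto values = std::make_tuple(1, 2.5);
        // one fold iteration per tuple index, all indices resolved at compile time
        [&]<std::size_t... Ns>(std::index_sequence<Ns...>) {
            ((std::cout << std::get<Ns>(names) << " = " << std::get<Ns>(values) << '\n'), ...);
        }(std::make_index_sequence<std::tuple_size_v<decltype(values)>>{});
        // prints: alpha = 1
        //         beta = 2.5
    }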

inline std::string op_meta_data_serialized_json(
uint32_t opID, const tt::tt_metal::operation::ExternalOperation& op, const std::vector<Tensor>& input_tensors) {
auto j = get_base_json<true>(opID, op, input_tensors);
@@ -322,6 +371,37 @@ inline std::string op_meta_data_serialized_json(
}
}

Review comment (Member): "type erasure would be really nice here to make this method non-templated" (a hedged sketch of the idea follows this function)

+template <typename operation_t>
+inline std::string op_meta_data_serialized_json(
+    const operation_t& operation,
+    uint32_t operation_id,
+    auto device_id,
+    const auto& program,
+    const auto& program_hash,
+    const auto& operation_attributes,
+    const auto& tensor_args,
+    auto& tensor_return_value) {
+    auto j = get_base_json<operation_t>(operation_id, operation_attributes, tensor_args, tensor_return_value);
+    j["op_type"] = magic_enum::enum_name(OpType::tt_dnn_device);
+    j["device_id"] = device_id;
+    j["op_hash"] = program_hash;
+    j["kernel_info"] = get_kernels_json(program);
+
+    j["optional_input_tensors"] = std::vector<json>{};
+
+    auto perfModel = operation_t::create_op_performance_model(operation_attributes, tensor_args, tensor_return_value);
+    j["performance_model"]["compute_ns"] = perfModel.get_compute_ns();
+    j["performance_model"]["ideal_ns"] = perfModel.get_ideal_ns();
+    j["performance_model"]["bandwidth_ns"] = perfModel.get_bandwidth_ns();
+    j["performance_model"]["input_bws"] = perfModel.get_input_bws();
+    j["performance_model"]["output_bws"] = perfModel.get_output_bws();
+
+    std::string short_str = fmt::format("`TT_DNN_DEVICE_OP: {}, {}, {}, ", j["op_code"], program_hash, device_id);
+
+    std::string ser = j.dump(4);
+    return fmt::format("{}{} ->\n{}`", short_str, operation_id, ser);
+}
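A hedged sketch of the reviewer's type-erasure suggestion (standalone toy; every name here is hypothetical, not proposed API): each templated entry point would only capture small closures, while the bulky serialization logic compiles once in a non-templated core.

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <string>

    struct erased_op_t {
        std::string op_code;                      // type name, already stringified
        std::function<std::string()> attributes;  // renders attributes on demand
    };

    // Non-templated core: one instantiation no matter how many op types exist.
    std::string serialize(const erased_op_t& op, uint32_t operation_id) {
        return "id=" + std::to_string(operation_id) + " op=" + op.op_code + " attrs=" + op.attributes();
    }

    // Thin templated shim: the only code that grows with the number of ops.
    template <typename operation_t>
    erased_op_t erase(const typename operation_t::operation_attributes_t& attrs) {
        return {operation_t::name, [attrs] { return attrs.to_string(); }};
    }

    // Toy operation to exercise the shim.
    struct example_op_t {
        static constexpr auto name = "example_op";
        struct operation_attributes_t {
            int dim;
            std::string to_string() const { return "dim=" + std::to_string(dim); }
        };
    };

    int main() {
        example_op_t::operation_attributes_t attrs{.dim = 3};
        std::cout << serialize(erase<example_op_t>(attrs), 42) << '\n';  // id=42 op=example_op attrs=dim=3
    }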

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors) \
std::string op_message = op_profiler::op_meta_data_serialized_json( \
Expand All @@ -338,6 +418,21 @@ inline std::string op_meta_data_serialized_json(
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

Review comment (Member): "is it possible to mark TracyOpTTNNDevice as deprecated somehow?" (a hedged sketch follows this macro)

+#define TracyOpTNNNDeviceV2( \
+    operation, operation_id, device_id, program, program_hash, operation_attributes, tensor_args, tensor_return_value) \
+    std::string op_message = op_profiler::op_meta_data_serialized_json( \
+        operation, \
+        operation_id, \
+        device_id, \
+        program, \
+        program_hash, \
+        operation_attributes, \
+        tensor_args, \
+        tensor_return_value); \
+    std::string op_text = fmt::format("id:{}", operation_id); \
+    ZoneText(op_text.c_str(), op_text.size()); \
+    TracyMessage(op_message.c_str(), op_message.size());
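On the deprecation question: a macro cannot carry [[deprecated]] directly, but it can expand to a call to a deprecated no-op function so every remaining use site emits a compiler warning. A hedged sketch (names hypothetical, original expansion elided):

    [[deprecated("use TracyOpTNNNDeviceV2 instead")]]
    inline void tracy_op_ttnn_device_is_deprecated() {}

    #define TracyOpTTNNDevice_impl(...) /* original expansion would go here */
    #define TracyOpTTNNDevice(...)            \
        tracy_op_ttnn_device_is_deprecated(); \
        TracyOpTTNNDevice_impl(__VA_ARGS__)

Each expansion of TracyOpTTNNDevice then triggers a -Wdeprecated-declarations warning pointing at the call site, which is about as close as the preprocessor gets to [[deprecated]].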

#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors) \
std::string op_message = \
op_profiler::op_meta_data_serialized_json(op_id, operation, input_tensors, output_tensors); \
Expand All @@ -355,6 +450,8 @@ inline std::string op_meta_data_serialized_json(

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors)
+#define TracyOpTNNNDeviceV2( \
+    operation, operation_id, device_id, program, program_hash, operation_attributes, tensor_args, tensor_return_value)
#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors)
#define TracyOpTTNNExternal(op_id, op, input_tensors)

42 changes: 42 additions & 0 deletions tt_metal/tt_stl/reflection.hpp
@@ -387,6 +387,48 @@ std::ostream& operator<<(std::ostream& os, const std::set<T>& set) {
return os;
}

+template <typename to_visit_t, typename T>
+    requires std::same_as<std::decay_t<T>, to_visit_t>
+constexpr auto visit_object_of_type(auto callback, T&& value) {
+    callback(value);
+}
+
+template <typename to_visit_t, typename T>
+constexpr auto visit_object_of_type(auto callback, const std::optional<T>& value) {
+    if (value.has_value()) {
+        visit_object_of_type<to_visit_t>(callback, value.value());
+    }
+}
+
+template <typename to_visit_t, typename T>
+constexpr auto visit_object_of_type(auto callback, const std::vector<T>& value) {
+    for (auto& tensor : value) {
+        visit_object_of_type<to_visit_t>(callback, tensor);
+    }
+}
+
+template <typename to_visit_t, typename T, auto N>
+constexpr auto visit_object_of_type(auto callback, const std::array<T, N>& value) {
+    for (auto& tensor : value) {
+        visit_object_of_type<to_visit_t>(callback, tensor);
+    }
+}
+
+template <typename to_visit_t, typename... Ts>
+constexpr auto visit_object_of_type(auto callback, const std::tuple<Ts...>& value) {
+    constexpr auto num_attributes = sizeof...(Ts);
+    [&callback, &value]<size_t... Ns>(std::index_sequence<Ns...>) {
+        (visit_object_of_type<to_visit_t>(callback, std::get<Ns>(value)), ...);
+    }(std::make_index_sequence<num_attributes>{});
+}
+
+template <typename to_visit_t, typename T>
+    requires(not std::same_as<std::decay_t<T>, to_visit_t>) and requires { std::decay_t<T>::attribute_names; }
+constexpr auto visit_object_of_type(auto callback, T&& object) {
+    constexpr auto num_attributes = std::tuple_size_v<decltype(std::decay_t<T>::attribute_names)>;
+    visit_object_of_type<to_visit_t>(callback, object.attribute_values());
+}
+
} // namespace reflection
} // namespace stl
} // namespace tt
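The visitor recurses structurally: optionals unwrap, vectors, arrays, and tuples iterate, any type exposing attribute_names walks its attribute_values(), and the requires-constrained base case fires the callback on exact type matches. A self-contained toy mirroring the same overload set (int stands in for Tensor; not the PR's code):

    #include <concepts>
    #include <iostream>
    #include <optional>
    #include <tuple>
    #include <type_traits>
    #include <vector>

    template <typename to_visit_t, typename T>
        requires std::same_as<std::decay_t<T>, to_visit_t>
    void visit(auto callback, T&& value) { callback(value); }

    template <typename to_visit_t, typename T>
    void visit(auto callback, const std::optional<T>& value) {
        if (value.has_value()) { visit<to_visit_t>(callback, value.value()); }
    }

    template <typename to_visit_t, typename T>
    void visit(auto callback, const std::vector<T>& value) {
        for (const auto& element : value) { visit<to_visit_t>(callback, element); }
    }

    template <typename to_visit_t, typename... Ts>
    void visit(auto callback, const std::tuple<Ts...>& value) {
        std::apply([&](const auto&... elements) { (visit<to_visit_t>(callback, elements), ...); }, value);
    }

    int main() {
        // flattens the nested structure and visits every int: prints 1 2 3 4
        visit<int>([](int v) { std::cout << v << ' '; },
                   std::tuple{1, std::optional<int>{2}, std::vector<int>{3, 4}});
    }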
4 changes: 4 additions & 0 deletions ttnn/CMakeLists.txt
@@ -6,6 +6,10 @@ set(TTNN_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/binary_op.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp
)

4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/decorators.hpp
@@ -219,7 +219,7 @@ struct operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (struct-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);

@@ -332,7 +332,7 @@ struct lambda_operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (lambda-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);
auto output = this->lambda(std::forward<decltype(args)>(args)...);