From 273a940cbcb91549a3e44dc05662d121a5a0b394 Mon Sep 17 00:00:00 2001
From: Marko Bezulj <156311081+mbezuljTT@users.noreply.github.com>
Date: Sat, 30 Nov 2024 09:46:17 +0100
Subject: [PATCH] fix graph_processor buffer_alloc_id calculus (#15426)

buffer_alloc_id was assigned per memory address; however, this produced
wrong results when capturing a composite op that allocates and
deallocates entire tensors, because two distinct buffers can end up at
the same address.

Add buffer->unique_id() and use it as the map key. Split id_to_counter
into tensor_id_to_counter and buffer_id_to_counter.
---
 tests/ttnn/unit_tests/gtests/CMakeLists.txt   |   1 +
 .../unit_tests/gtests/test_graph_basic.cpp    | 123 ++++++++++++++++++
 tt_metal/impl/buffers/buffer.cpp              |   6 +
 tt_metal/impl/buffers/buffer.hpp              |   4 +
 tt_metal/include/tt_metal/buffer.hpp          |   8 ++
 ttnn/cpp/ttnn/graph/graph_processor.cpp       |  32 ++---
 ttnn/cpp/ttnn/graph/graph_processor.hpp       |   3 +-
 7 files changed, 160 insertions(+), 17 deletions(-)
 create mode 100644 tests/ttnn/unit_tests/gtests/test_graph_basic.cpp

diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
index fcdd7a0b35f..52e1e69c098 100644
--- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt
+++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(TTNN_UNIT_TESTS_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/test_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_graph_add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_graph_basic.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_async_runtime.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_multiprod_queue.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_cq_multi_dev.cpp
diff --git a/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp b/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp
new file mode 100644
index 00000000000..9cfd0dec7b2
--- /dev/null
+++ b/tests/ttnn/unit_tests/gtests/test_graph_basic.cpp
@@ -0,0 +1,123 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gtest/gtest.h"
+#include "ttnn_test_fixtures.hpp"
+#include "ttnn/device.hpp"
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/graph/graph_processor.hpp"
+#include "ttnn/graph/graph_consts.hpp"
+
+#include <string>
+
+namespace ttnn::graph::test {
+
+struct BufferTestParam {
+    ttnn::SimpleShape shape_a;
+    ttnn::SimpleShape shape_b;
+};
+
+class BufferTestFixture
+    : public TTNNFixtureWithDevice,
+      public testing::WithParamInterface<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>> {};
+
+TEST_P(BufferTestFixture, BufferTest) {
+    auto param_combination = GetParam();
+    auto params = std::get<0>(param_combination);
+    auto run_mode = std::get<1>(param_combination);
+
+    tt::tt_metal::Device* device = &(this->getDevice());
+    {
+        ttnn::graph::GraphProcessor::begin_graph_capture(run_mode);
+        {
+            const auto input_a = ttnn::TensorSpec(
+                params.shape_a,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);
+        }
+        {
+            const auto input_a = ttnn::TensorSpec(
+                params.shape_a,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+            const auto input_tensor_a = tt::tt_metal::create_device_tensor(input_a, device);
+
+            const auto input_b = ttnn::TensorSpec(
+                params.shape_b,
+                tt::tt_metal::TensorLayout(
+                    tt::tt_metal::DataType::BFLOAT16,
+                    tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE),
+                    ttnn::L1_MEMORY_CONFIG));
+
+            const auto input_tensor_b = tt::tt_metal::create_device_tensor(input_b, device);
+        }
+        auto trace = ttnn::graph::GraphProcessor::end_graph_capture();
+
+        auto find_nodes_by_type = [](const auto& trace, const std::string& type) {
+            std::vector<nlohmann::json> nodes;
+            for (const auto& node : trace) {
+                if (node.at(kNodeType) == type) {
+                    nodes.push_back(node);
+                }
+            }
+            return nodes;
+        };
+
+        // Check that there are three buffer_allocate nodes and that each is connected to a different buffer
+        auto buffer_allocate_nodes = find_nodes_by_type(trace, kNodeBufferAllocate);
+        EXPECT_EQ(buffer_allocate_nodes.size(), 3);
+        for (const auto& node : buffer_allocate_nodes) {
+            EXPECT_EQ(node.at(kConnections).size(), 1);
+        }
+        auto connection_a = buffer_allocate_nodes[0].at(kConnections)[0].get<int>();
+        auto connection_a2 = buffer_allocate_nodes[1].at(kConnections)[0].get<int>();
+        auto connection_c = buffer_allocate_nodes[2].at(kConnections)[0].get<int>();
+        EXPECT_NE(connection_a, connection_a2);
+        EXPECT_NE(connection_a, connection_c);
+        EXPECT_NE(connection_a2, connection_c);
+
+        // Check that there are three buffer nodes and that they have the correct sizes
+        auto buffer_nodes = find_nodes_by_type(trace, kNodeBuffer);
+        EXPECT_EQ(buffer_nodes.size(), 3);
+        auto size_a = std::stoi(buffer_nodes[0].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_a.volume() * 2, size_a);
+        auto size_a2 = std::stoi(buffer_nodes[1].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_a.volume() * 2, size_a2);
+        auto size_b = std::stoi(buffer_nodes[2].at(kParams).at(kSize).get<std::string>());
+        EXPECT_EQ(params.shape_b.volume() * 2, size_b);
+
+        // Print the trace for reference
+        std::cout << trace << std::endl;
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BufferTest,
+    BufferTestFixture,
+    ::testing::Combine(
+        ::testing::Values(BufferTestParam{
+            .shape_a = ttnn::SimpleShape(tt::tt_metal::Array4D{1, 1, 32, 32}),
+            .shape_b = ttnn::SimpleShape(tt::tt_metal::Array4D{32, 1, 32, 32})}),
+        ::testing::Values(
+            tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH, tt::tt_metal::IGraphProcessor::RunMode::NORMAL)),
+    [](const testing::TestParamInfo<std::tuple<BufferTestParam, tt::tt_metal::IGraphProcessor::RunMode>>& info) {
+        std::stringstream ss;
+
+        static uint32_t uid = 0;
+        ss << uid++;
+
+        const auto& run_mode = std::get<1>(info.param);
+        switch (run_mode) {
+            case tt::tt_metal::IGraphProcessor::RunMode::NO_DISPATCH: ss << "_NO_DISPATCH"; break;
+            case tt::tt_metal::IGraphProcessor::RunMode::NORMAL: ss << "_NORMAL"; break;
+            default: break;
+        }
+        return ss.str();
+    });
+}  // namespace ttnn::graph::test
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 54756581fee..34975b0442b 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/types.hpp"
 
 #include <algorithm>
+#include <atomic>
 #include <mutex>
 
 #include "tt_metal/common/base.hpp"
@@ -25,6 +26,8 @@ namespace tt {
 
 namespace tt_metal {
 
+std::atomic<size_t> Buffer::next_unique_id = 0;
+
 std::ostream& operator<<(std::ostream& os, const ShardSpec& spec) {
     tt::stl::reflection::operator<<(os, spec);
     return os;
 }
@@ -247,6 +250,7 @@ Buffer::Buffer(
     if (size != 0) {
         validate_buffer_size_and_page_size(size, page_size, buffer_type, buffer_layout, shard_parameters);
     }
+    unique_id_ = next_unique_id.fetch_add(1);
 }
 
 std::shared_ptr<Buffer> Buffer::create(
@@ -524,6 +528,8 @@ v1::BufferHandle v1::CreateBuffer(InterleavedBufferConfig config) { return v1::BufferHandle{v0::CreateBuffer(config)}; }
 
 void v1::DeallocateBuffer(const BufferHandle& buffer) { v0::DeallocateBuffer(*buffer); }
 
+std::size_t v1::GetId(const BufferHandle& buffer) { return buffer->unique_id(); }
+
 void v1::WriteToBuffer(const BufferHandle& buffer, stl::Span<const std::byte> host_buffer) {
     detail::WriteToBuffer(
         *buffer, stl::Span<const uint8_t>{reinterpret_cast<const uint8_t*>(host_buffer.data()), host_buffer.size()});
 }
diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp
index f17a39104bc..4539da12fd8 100644
--- a/tt_metal/impl/buffers/buffer.hpp
+++ b/tt_metal/impl/buffers/buffer.hpp
@@ -248,6 +248,8 @@ class Buffer final {
     std::optional<SubDeviceId> sub_device_id() const { return sub_device_id_; }
     std::optional<SubDeviceManagerId> sub_device_manager_id() const { return sub_device_manager_id_; }
 
+    size_t unique_id() const { return unique_id_; }
+
     Buffer(
         Device *device,
         DeviceAddr size,
@@ -300,6 +302,8 @@ class Buffer final {
     std::shared_ptr<const BufferPageMapping> buffer_page_mapping_;
 
     std::weak_ptr<Buffer> weak_self;
+    size_t unique_id_ = 0;
+    static std::atomic<size_t> next_unique_id;
 };
 
 }  // namespace v0
diff --git a/tt_metal/include/tt_metal/buffer.hpp b/tt_metal/include/tt_metal/buffer.hpp
index a69f6a436eb..ea7d61f5c72 100644
--- a/tt_metal/include/tt_metal/buffer.hpp
+++ b/tt_metal/include/tt_metal/buffer.hpp
@@ -28,6 +28,14 @@ BufferHandle CreateBuffer(InterleavedBufferConfig config);
  */
 void DeallocateBuffer(const BufferHandle& buffer);
 
+/**
+ * @brief Retrieves the ID of the specified buffer.
+ *
+ * @param buffer The buffer to get the ID from.
+ * @return The unique ID of the buffer.
+ */
+std::size_t GetId(const BufferHandle& buffer);
+
 /**
  * @brief Copies data from a host buffer into the specified device buffer.
  *
diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp
index f702edc1a26..637e936c9f0 100644
--- a/ttnn/cpp/ttnn/graph/graph_processor.cpp
+++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -95,9 +95,8 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) {
 }
 void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
     const std::lock_guard<std::mutex> lock(mutex);
-    auto buf_id = add_buffer(buffer);
+    auto buffer_id = add_buffer(buffer);
 
-    auto alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
     auto counter = graph.size();
 
     std::unordered_map<std::string, std::string> params = {
@@ -113,7 +112,7 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
         .counter = counter,
         .node_type = kNodeBufferAllocate,
         .params = params,
-        .connections = {buf_id}
+        .connections = {buffer_id}
     });
     graph[current_op_id.top()].connections.push_back(counter);
 }
@@ -121,7 +120,7 @@
 
 void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
     const std::lock_guard<std::mutex> lock(mutex);
-    auto buffer_idx = add_buffer(buffer);
+    auto buffer_id = add_buffer(buffer);
     auto counter = graph.size();
     std::unordered_map<std::string, std::string> params = {
         {kSize, std::to_string(buffer->size())},
@@ -135,7 +134,7 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
         .counter = counter,
         .node_type = kNodeBufferDeallocate,
         .params = params,
-        .connections = {buffer_idx}
+        .connections = {buffer_id}
     });
     graph[current_op_id.top()].connections.push_back(counter);
 }
@@ -286,21 +285,21 @@ int GraphProcessor::add_tensor(const Tensor& t) {
     } else {
         tensor_id = t.tensor_id.value();
     }
-    auto tensor_counter = id_to_counter.count(tensor_id) > 0 ? id_to_counter[tensor_id] : graph.size();
+    auto tensor_counter = tensor_id_to_counter.count(tensor_id) > 0 ? tensor_id_to_counter[tensor_id] : graph.size();
     auto shape = t.get_shape();
 
     std::unordered_map<std::string, std::string> params = {
         {kShape, fmt::format("{}", shape)},
         {kTensorId, fmt::format("{}", tensor_id)},
     };
-    if (id_to_counter.count(tensor_id) == 0) {
+    if (tensor_id_to_counter.count(tensor_id) == 0) {
         graph.push_back(Vertex{.counter = tensor_counter, .node_type = kNodeTensor, .params = params, .connections = {}});
-        id_to_counter[tensor_id] = tensor_counter;
+        tensor_id_to_counter[tensor_id] = tensor_counter;
     }
 
     if (buffer) {
-        auto buffer_idx = add_buffer(buffer);
-        graph[buffer_idx].connections.push_back(tensor_counter);
+        auto buffer_id = add_buffer(buffer);
+        graph[buffer_id].connections.push_back(tensor_counter);
     } else {
         tt::log_info("Tensor doesn't have buffer, but storage is {}", demangle(get_type_in_var(t.get_storage()).name()));
     }
@@ -308,9 +307,9 @@ int GraphProcessor::add_tensor(const Tensor& t) {
 }
 
 int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) {
-    auto buffer_alloc_id = reinterpret_cast<std::uintptr_t>(buffer);
-    auto counter = id_to_counter.count(buffer_alloc_id) > 0 ? id_to_counter[buffer_alloc_id] : graph.size();
-    if (id_to_counter.count(buffer_alloc_id) == 0) {
+    auto buffer_id = buffer->unique_id();
+    auto counter = buffer_id_to_counter.count(buffer_id) > 0 ? buffer_id_to_counter[buffer_id] : graph.size();
+    if (buffer_id_to_counter.count(buffer_id) == 0) {
        std::unordered_map<std::string, std::string> params = {
            {kSize, std::to_string(buffer->size())},
"DRAM" : "L1"}, @@ -324,10 +323,10 @@ int GraphProcessor::add_buffer(const tt::tt_metal::Buffer* buffer) { .connections = {} }); graph[current_op_id.top()].connections.push_back(counter); - id_to_counter[buffer_alloc_id] = counter; + buffer_id_to_counter[buffer_id] = counter; return counter; } - return id_to_counter[buffer_alloc_id]; + return buffer_id_to_counter[buffer_id]; } @@ -428,7 +427,8 @@ void GraphProcessor::end_function_process_optional_tensor(const std::any& any_va void GraphProcessor::begin_capture(RunMode mode) { const std::lock_guard lock(mutex); graph.clear(); - id_to_counter.clear(); + buffer_id_to_counter.clear(); + tensor_id_to_counter.clear(); graph.push_back(Vertex{ .counter = 0, .node_type = kNodeCaptureStart, diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp index 7f89251622d..488d7cf1f06 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.hpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp @@ -73,7 +73,8 @@ namespace ttnn::graph { std::mutex mutex; RunMode run_mode = RunMode::NORMAL; std::stack current_op_id; - std::unordered_map id_to_counter; + std::unordered_map buffer_id_to_counter; + std::unordered_map tensor_id_to_counter; int last_finished_op_id = -1; std::vector graph; std::unordered_map begin_function_any_map;