#12624: Fix tracking of globally allocated CBs in graph capture
derdeljanTT committed Oct 31, 2024
1 parent fe1129c commit 4c9787e
Showing 6 changed files with 35 additions and 35 deletions.
4 changes: 2 additions & 2 deletions tt_metal/graph/graph_tracking.cpp
@@ -45,12 +45,12 @@ void GraphTracker::track_deallocate(Buffer* buffer) {
}
}

void GraphTracker::track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size) {
void GraphTracker::track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size, bool is_globally_allocated) {
if (processors.empty()) {
return;
}
for (auto& it : processors) {
it->track_allocate_cb(core_range_set, addr, size);
it->track_allocate_cb(core_range_set, addr, size, is_globally_allocated);
}
}

4 changes: 2 additions & 2 deletions tt_metal/graph/graph_tracking.hpp
@@ -32,7 +32,7 @@ inline namespace v0 {

virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {};

virtual void track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size) {};
virtual void track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size, bool is_globally_allocated) {};

virtual void track_deallocate_cb() {};

@@ -81,7 +81,7 @@ inline namespace v0 {

void track_deallocate(Buffer* buffer);

void track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size);
void track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size, bool is_globally_allocated);

void track_deallocate_cb();

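The extra flag only widens the processor hook, so implementations just need to accept the new parameter. Below is a minimal sketch of a custom processor built against the virtuals above, splitting CB bytes by allocation kind; the base-class name (tt::tt_metal::IGraphProcessor) and the header path are assumptions taken from the tt-metal tree, not something this diff shows.

// Sketch only: a custom processor using the widened hook.
// Assumed base class and header path; not part of this commit.
#include <cstdint>
#include "tt_metal/graph/graph_tracking.hpp"

class CbAccountingProcessor : public tt::tt_metal::IGraphProcessor {
public:
    void track_allocate_cb(const CoreRangeSet& core_range_set,
                           uint64_t addr,
                           uint64_t size,
                           bool is_globally_allocated) override {
        (void)core_range_set;
        (void)addr;
        // Split CB bytes by allocation kind using the flag added in this commit.
        if (is_globally_allocated) {
            globally_allocated_cb_bytes += size;
        } else {
            locally_allocated_cb_bytes += size;
        }
    }

    uint64_t locally_allocated_cb_bytes = 0;
    uint64_t globally_allocated_cb_bytes = 0;
};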
2 changes: 1 addition & 1 deletion tt_metal/impl/program/program.cpp
@@ -700,7 +700,7 @@ void detail::Program_::allocate_circular_buffers(const Device *device) {
}
}
}
tt::tt_metal::GraphTracker::instance().track_allocate_cb(circular_buffer->core_ranges(), computed_addr, circular_buffer->size());
tt::tt_metal::GraphTracker::instance().track_allocate_cb(circular_buffer->core_ranges(), computed_addr, circular_buffer->size(), circular_buffer->globally_allocated());
circular_buffer->set_locally_allocated_address(computed_addr);
}
this->local_circular_buffer_allocation_needed_ = false;
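At this call site the flag comes from circular_buffer->globally_allocated(), i.e. from CBs that were pointed at an already-allocated L1 buffer rather than given a fresh local address. A rough host-side sketch of how such a CB is typically configured follows; the API names are from the tt-metal v0 host API, but the exact signatures, header path, and data format here are assumptions, not verbatim usage from this repository.

// Sketch, not verbatim tt-metal usage: header path, constructor form, and data format are assumed.
#include "tt_metal/host_api.hpp"

tt::tt_metal::CBHandle add_globally_allocated_cb(tt::tt_metal::Program& program,
                                                 const CoreRangeSet& cores,
                                                 const tt::tt_metal::Buffer& backing_l1_buffer,
                                                 uint8_t cb_index,
                                                 uint32_t cb_size_bytes) {
    tt::tt_metal::CircularBufferConfig cb_config =
        tt::tt_metal::CircularBufferConfig(cb_size_bytes, {{cb_index, tt::DataFormat::Float16_b}})
            // Back the CB with an existing L1 buffer; allocate_circular_buffers() then skips
            // local allocation and the tracking call above reports is_globally_allocated = true.
            .set_globally_allocated_address(backing_l1_buffer);

    return tt::tt_metal::CreateCircularBuffer(program, cores, cb_config);
}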
8 changes: 5 additions & 3 deletions ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -10,6 +10,7 @@
#include "ttnn/graph/graph_consts.hpp"
#include <cxxabi.h>
#include <memory>
#include <string>
#include <typeindex>
#include <unordered_map>
#include "ttnn/core.hpp"
@@ -135,12 +136,13 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {

}

void GraphProcessor::track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size) {
void GraphProcessor::track_allocate_cb(const CoreRangeSet &core_range_set, uint64_t addr, uint64_t size, bool is_globally_allocated) {
const std::lock_guard<std::mutex> lock(mutex);
std::unordered_map<std::string, std::string> params = {
{kSize, std::to_string(size)},
{kAddress, std::to_string(addr)},
{"core_range_set", core_range_set.str()}
{"core_range_set", core_range_set.str()},
{"globally_allocated", std::to_string(is_globally_allocated)}
};
auto counter = graph.size();
{
@@ -179,7 +181,7 @@ void GraphProcessor::track_program(tt::tt_metal::Program* program) {
}

for (auto& cb : program->circular_buffers()) {
track_allocate_cb(cb->core_ranges(), 0, cb->size());
track_allocate_cb(cb->core_ranges(), 0, cb->size(), cb->globally_allocated());
}
}

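On the capture side the flag is recorded in the node's params next to the CB's size and address, so trace consumers can tell the two kinds of CB apart. A sketch of such a consumer is below; only the "globally_allocated" key is taken verbatim from this commit, while the "node_type", "params", and "size" strings and the "circular_buffer_allocate" node type are assumptions about the constants in graph_consts.hpp. The flag is serialized through std::to_string, i.e. as "0" or "1".

// Sketch: sums only locally allocated CB bytes from a captured trace.
#include <cstdint>
#include <string>
#include <nlohmann/json.hpp>

uint64_t sum_locally_allocated_cb_bytes(const nlohmann::json& trace) {
    uint64_t total = 0;
    for (const auto& node : trace) {
        if (node["node_type"] != "circular_buffer_allocate") {
            continue;
        }
        const auto& params = node["params"];
        // Skip CBs flagged as globally allocated; they reuse an existing buffer's space.
        if (params["globally_allocated"].get<std::string>() == "1") {
            continue;
        }
        total += std::stoull(params["size"].get<std::string>());
    }
    return total;
}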
2 changes: 1 addition & 1 deletion ttnn/cpp/ttnn/graph/graph_processor.hpp
@@ -44,7 +44,7 @@ namespace ttnn::graph {

void track_deallocate(tt::tt_metal::Buffer* buffer) override;

void track_allocate_cb(const CoreRangeSet &core_range, uint64_t addr, uint64_t size) override;
void track_allocate_cb(const CoreRangeSet &core_range, uint64_t addr, uint64_t size, bool is_globally_allocated) override;

void track_deallocate_cb() override;

50 changes: 24 additions & 26 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.cpp
@@ -4,17 +4,15 @@

#include "graph_trace_utils.hpp"

#include "graph_processor.hpp"
#include "graph_consts.hpp"


#include "tt_metal/common/assert.hpp"

#include <unordered_set>
#include <cstdlib> // std::strtoul
#include <string>
#include <cstdlib> // std::strtoul


#include "graph_consts.hpp"
#include "graph_processor.hpp"
#include "tt_metal/common/assert.hpp"

namespace ttnn::graph {

namespace {
@@ -32,20 +30,21 @@ ttnn::Shape parse_shape(std::string_view shape_string) {
while (str < end_str) {
char* next;
uint32_t value = std::strtoul(str, &next, 10);
if (str == next) break; // no conversion happened
if (str == next)
break; // no conversion happened
shape.push_back(value);
str = next;
if (*str == ',') {
++str; // skip the comma
++str; // skip the comma
}
if (*str == ' ') {
++str; // skip spaces, assume a single space
++str; // skip spaces, assume a single space
}
}

return ttnn::Shape(shape);
}
} // namespace
} // namespace

uint32_t extract_peak_L1_memory_usage(const nlohmann::json& trace) {
uint32_t total_cb = 0;
@@ -80,7 +79,7 @@ uint32_t extract_peak_L1_memory_usage(const nlohmann::json& trace) {
} else if (v[kNodeType] == kNodeBufferDeallocate) {
auto connection = v[kConnections][0].get<int>();
auto buffer = trace[connection];
if(buffer[kParams][kType] == "L1") {
if (buffer[kParams][kType] == "L1") {
total_buffer -= stoi(buffer[kParams][kSize].get<std::string>());
}
} else if (v[kNodeType] == kNodeFunctionEnd) {
@@ -113,7 +112,7 @@ std::pair<uint32_t, uint32_t> count_intermediate_and_output_tensors(const nlohma
last_end_found = true;
last_end_index = i;

if(v[kParams][kName] == "create_device_tensor") {
if (v[kParams][kName] == "create_device_tensor") {
auto id = v[kConnections][0].get<int>();
intermediate_tensors.insert(id);
}
@@ -124,22 +123,22 @@ std::pair<uint32_t, uint32_t> count_intermediate_and_output_tensors(const nlohma
TT_ASSERT(last_end_found);

auto connections = trace[last_end_index][kConnections].get<std::unordered_set<uint32_t>>();
for(auto index : connections) {
for (auto index : connections) {
// It can be tensor or some other node like
if(trace[index][kNodeType] == kNodeTensor) {
if (trace[index][kNodeType] == kNodeTensor) {
output_tensors.insert(index);
}
}

for(int index : output_tensors) {
for (int index : output_tensors) {
intermediate_tensors.erase(index);
}

// Return the counts of intermediate and output tensors
return {intermediate_tensors.size(), output_tensors.size()};
}

std::vector<std::string> extract_calltrace(const nlohmann::json& trace){
std::vector<std::string> extract_calltrace(const nlohmann::json& trace) {
std::vector<std::string> op_calls;
size_t i = 0;

@@ -155,11 +154,10 @@ std::vector<std::string> extract_calltrace(const nlohmann::json& trace){
return op_calls;
}

std::unordered_set<uint32_t> extract_output_tensors(const nlohmann::json& trace)
{
std::unordered_set<uint32_t> extract_output_tensors(const nlohmann::json& trace) {
// Lambda to find the last 'function_end' node
auto find_function_end_node = [](const auto& trace) -> const nlohmann::json& {
for(int i = trace.size() - 1; i >= 0; --i) {
for (int i = trace.size() - 1; i >= 0; --i) {
const auto& v = trace[i];
if (v[kNodeType] == kNodeFunctionEnd) {
return v;
@@ -187,33 +185,33 @@ std::unordered_set<uint32_t> extract_output_tensors(const nlohmann::json& trace)
return output_tensors;
}

std::vector<TensorInfo> extract_output_info(const nlohmann::json& trace)
{
std::vector<TensorInfo> extract_output_info(const nlohmann::json& trace) {
std::vector<TensorInfo> output;
auto output_tensors = extract_output_tensors(trace);

for (const auto& node : trace) {
if (node[kNodeType] != kNodeBuffer )
if (node[kNodeType] != kNodeBuffer)
continue;

auto connections = node[kConnections].get<std::unordered_set<uint32_t>>();
for (const auto& tensor_id : connections) {
if (output_tensors.find(tensor_id) == output_tensors.end())
continue;

const auto type = node[kParams][kType] == "L1" ? tt::tt_metal::BufferType::L1 : tt::tt_metal::BufferType::DRAM;
const auto type =
node[kParams][kType] == "L1" ? tt::tt_metal::BufferType::L1 : tt::tt_metal::BufferType::DRAM;
const auto size = stoi(node[kParams][kSize].get<std::string>());

const auto& tensor = trace[tensor_id];
const std::string shape_string = tensor[kParams][kShape];
const auto shape = parse_shape(shape_string);

output.emplace_back(TensorInfo {.shape = shape, .size = size, .type = type});
output.emplace_back(TensorInfo{.shape = shape, .size = size, .type = type});
}
}

return output;
}


} // namespace ttnn::graph
} // namespace ttnn::graph
