#1588: Enable metal device side profiler dumps (#1590)

This somehow got missed during the addition of TTNN perf reports - reported by Vlad.
tenstorrent · Dec 13, 2024 · 9d34762 · 9d34762
1 parent 30a7a9e
commit 9d34762
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 0 deletions.
diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h
@@ -9,6 +9,7 @@
 #include "distributed/mesh_device.hpp"
 #include "impl/buffers/circular_buffer.hpp"
 #include "impl/event/event.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
 #include "tt_metal/host_api.hpp"
 
 #include "tt/runtime/types.h"

diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp
@@ -83,6 +83,13 @@ void closeDevice(Device device) {
   ::tt::tt_metal::distributed::MeshDevice &ttmetalMeshDevice =
       device.as<::tt::tt_metal::distributed::MeshDevice>(
           DeviceRuntime::TTMetal);
+
+#if defined(TT_RUNTIME_ENABLE_PERF_TRACE)
+  for (::tt::tt_metal::Device *ttmetalDevice :
+       ttmetalMeshDevice.get_devices()) {
+    ::tt::tt_metal::detail::DumpDeviceProfileResults(ttmetalDevice);
+  }
+#endif
   ttmetalMeshDevice.close_devices();
 }
 

diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir
@@ -0,0 +1,11 @@
+// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm
+
+func.func @add(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
+  // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
+  %0 = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
+  %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32>
+  return %1 : tensor<64x128xf32>
+}
diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir
@@ -0,0 +1,11 @@
+// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm
+
+func.func @div(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
+  // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
+  %0 = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
+  %1 = "ttir.div"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32>
+  return %1 : tensor<64x128xf32>
+}
diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir
@@ -0,0 +1,11 @@
+// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm
+
+func.func @exp(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> {
+  // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
+  %0 = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
+  %1 = "ttir.exp"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32>
+  return %1 : tensor<64x128xf32>
+}
diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir
@@ -0,0 +1,11 @@
+// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm
+
+func.func @maximum(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
+  // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
+  %0 = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
+  %1 = "ttir.maximum"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32>
+  return %1 : tensor<64x128xf32>
+}
diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir
@@ -0,0 +1,11 @@
+// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm
+
+func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
+  // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
+  %0 = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
+  %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32>
+  return %1 : tensor<64x128xf32>
+}