From 9d347627f6577430bc9486f31c9e1593d4c8eb18 Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:55:06 -0600 Subject: [PATCH] #1588: Enable metal device side profiler dumps (#1590) This somehow got missed during the addition of TTNN perf reports - reported by Vlad. --- runtime/include/tt/runtime/detail/ttmetal.h | 1 + runtime/lib/ttmetal/runtime.cpp | 7 +++++++ .../Silicon/TTMetal/perf_unit/test_perf_add.mlir | 11 +++++++++++ .../Silicon/TTMetal/perf_unit/test_perf_div.mlir | 11 +++++++++++ .../Silicon/TTMetal/perf_unit/test_perf_exp.mlir | 11 +++++++++++ .../Silicon/TTMetal/perf_unit/test_perf_max.mlir | 11 +++++++++++ .../Silicon/TTMetal/perf_unit/test_perf_multiply.mlir | 11 +++++++++++ 7 files changed, 63 insertions(+) create mode 100644 test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir create mode 100644 test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir create mode 100644 test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir create mode 100644 test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir create mode 100644 test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 1b043f6e5..e532ec05f 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -9,6 +9,7 @@ #include "distributed/mesh_device.hpp" #include "impl/buffers/circular_buffer.hpp" #include "impl/event/event.hpp" +#include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt/runtime/types.h" diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index 634b26a43..202965087 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -83,6 +83,13 @@ void closeDevice(Device device) { ::tt::tt_metal::distributed::MeshDevice &ttmetalMeshDevice = 
device.as<::tt::tt_metal::distributed::MeshDevice>( DeviceRuntime::TTMetal); + +#if defined(TT_RUNTIME_ENABLE_PERF_TRACE) + for (::tt::tt_metal::Device *ttmetalDevice : + ttmetalMeshDevice.get_devices()) { + ::tt::tt_metal::detail::DumpDeviceProfileResults(ttmetalDevice); + } +#endif ttmetalMeshDevice.close_devices(); } diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir new file mode 100644 index 000000000..c89ad5242 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_add.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +func.func @add(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir new file mode 100644 index 000000000..a24a2f4f7 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_div.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +func.func @div(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.div"(%arg0, %arg1, %0) 
<{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir new file mode 100644 index 000000000..f46543c80 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_exp.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +func.func @exp(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.exp"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir new file mode 100644 index 000000000..b8dcae064 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_max.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +func.func @maximum(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.maximum"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git 
a/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir new file mode 100644 index 000000000..e8ca11af4 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/perf_unit/test_perf_multiply.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +}