From c9996d5949149e769f66731d06934bc2c0e9022e Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 24 Oct 2024 00:31:20 +0000 Subject: [PATCH] #5174: Uplifting microbenchmarks to run on BH --- tests/scripts/run_moreh_microbenchmark.sh | 8 +-- tests/scripts/test_moreh_microbenchmark.py | 72 ++++++++++++++++++- .../1_compute_mm/test_compute_mm.cpp | 9 ++- .../test_dram_read.cpp | 18 +++-- 4 files changed, 94 insertions(+), 13 deletions(-) diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh index 2b7107bb7df..f2a085fa2c7 100755 --- a/tests/scripts/run_moreh_microbenchmark.sh +++ b/tests/scripts/run_moreh_microbenchmark.sh @@ -29,12 +29,12 @@ run_profiling_test() { pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_h2d_l1 -k $ARCH_NAME pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_d2h_l1 -k $ARCH_NAME # pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_noc -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME + pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME # how to set r and c for this + pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME # how to set r and c for this - if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then + if [[ "$ARCH_NAME" != "grayskull" ]]; then pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME + pytest --capture=tee-sys 
$TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_all_core -k $ARCH_NAME pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME fi # bypass wh_b0 for now until we can move FD cores to last col diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py index 6076d6c034e..a6f946eda36 100755 --- a/tests/scripts/test_moreh_microbenchmark.py +++ b/tests/scripts/test_moreh_microbenchmark.py @@ -529,6 +529,7 @@ def test_pcie_d2h_dram(iteration, test_vector_small, test_vector_large): [ ("grayskull", 2, 1048576, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), ("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), + ("blackhole", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), ], ) def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector): @@ -553,6 +554,7 @@ def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector): [ ("grayskull", 2, 1048576, np.array([4096, 16384, 65536])), ("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536])), + ("blackhole", 2, 1499136, np.array([4096, 16384, 65536])), ], ) def test_pcie_d2h_l1(arch, iteration, L1_size, test_vector): @@ -649,6 +651,16 @@ def test_matmul_dram(arch, freq, r, c, test_vector): ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0), ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0), ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0), + # ########################### 512 512 512 x 8 subblock 4 2 ################################ + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 0, 0, 0, 0, 717089.0), + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 0, 0, 0, 0, 1233930.0), + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 1, 0, 0, 0, 664492.0), + 
("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 1, 0, 0, 0, 1173029.0), + # ########################### 512 512 256x8 subblock 4 2 ################################ + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 0, 0, 0, 0, 399068.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0), ], ) def test_matmul_single_core_sharded( @@ -716,9 +728,12 @@ def test_matmul_single_core_sharded( ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0), ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0), ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0), # Padded FF1 shapes for llama 70b on TG + ("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 0, 8, 0), + ("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 1, 8, 0), + ("blackhole", 800, np.array([2048, 3840]), 1, 4, 1, 8, 0), # Padded FF1 shapes for llama 70b on TG ], ) -def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id): +def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id): data = [] cycle_list = [] time_list = [] @@ -770,6 +785,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 240, ), # 244 GB/s + ( + "blackhole", + np.array([2048, 3840]), + 1, + 16, + 0, + 8, + 0, + 240, + ), # 244 GB/s # FF2 shapes for TG llama 70b ( "wormhole_b0", @@ -781,6 +806,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 250, ), # 255 GB/s + ( + "blackhole", + np.array([3584, 2304]), + 1, + 28, + 1, + 8, + 0, + 250, + ), # 255 GB/s # Dense Out shapes for TG llama 70b ( "wormhole_b0", @@ -792,6 +827,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 
220, ), # 226 GB/s + ( + "blackhole", + np.array([1024, 2304]), + 1, + 8, + 1, + 8, + 0, + 220, + ), # 226 GB/s # QKV shapes for TG llama 70b ( "wormhole_b0", @@ -803,6 +848,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 225, ), # 232 GB/s + ( + "blackhole", + np.array([2048, 1536]), + 1, + 16, + 1, + 8, + 0, + 225, + ), # 232 GB/s ], ) def test_dram_read_l1_write_core( @@ -866,6 +921,20 @@ def test_dram_read_l1_write_core( ("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1), # # multi layer multi receiver test ("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15), + # single layer single receiver test + ("blackhole", None, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1), + # single layer multi receiver test + ("blackhole", None, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1), + # multi layer multi receiver test + ("blackhole", None, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15), + # Matmul test does not support mixed data format, just test for either bfp8 or fp16 + # single layer single receiver test + ("blackhole", "Matmul", np.array([32, 4096, 128]), 1, 8, 10, 256, 0, 1, 1), + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 1, 1), + # # single layer multi receiver test + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1), + # # multi layer multi receiver test + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15), ], ) @pytest.mark.parametrize( @@ -944,6 +1013,7 @@ def test_dram_read_remote_cb_sync( [ ("grayskull", 1020, 9, 12, np.array([[3456, 3072, 1024], [2304, 3072, 768]]), np.array([[2304, 3072, 768]])), ("wormhole_b0", 1000, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])), + ("blackhole", 800, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])), ], ) def test_matmul_l1(arch, freq, r, c, test_vector_global, 
test_vector_local): diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index ced445053f8..d72ac2a08b1 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -684,17 +684,21 @@ int main(int argc, char** argv) { uint32_t get_l1_size(tt::ARCH arch) { constexpr uint32_t GS_L1_SIZE = 1048576; constexpr uint32_t WH_L1_SIZE = 1499136; + constexpr uint32_t BH_L1_SIZE = 1499136; uint32_t l1_size = 0; if (arch == tt::ARCH::WORMHOLE_B0) { l1_size = WH_L1_SIZE; } else if (arch == tt::ARCH::GRAYSKULL) { l1_size = GS_L1_SIZE; + } else if (arch == tt::ARCH::BLACKHOLE) { + l1_size = BH_L1_SIZE; } return l1_size; } double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) { + constexpr double BH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.97; constexpr double WH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.05; constexpr double GS_FPU_BFP8_TFLOPS_PER_TENSIX = 0.58; @@ -707,6 +711,9 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_cl } else if (arch == tt::ARCH::GRAYSKULL) { rpeak_tflops = GS_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast(num_compute_core) * static_cast(clock); + } else if (arch == tt::ARCH::BLACKHOLE) { + rpeak_tflops = + BH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast(num_compute_core) * static_cast(clock); } log_debug(LogTest, "Rpeak {} TFLOPS", rpeak_tflops); @@ -777,7 +784,7 @@ CoreCoord get_core_range( std::tuple get_compute_params(tt::ARCH arch) { MathFidelity math_fidelity = MathFidelity::HiFi4; bool fp32_dest_acc_en = false; - if (arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE) { math_fidelity = MathFidelity::HiFi2; // TODO: apply packer_l1_acc // TODO: need to consider whether to set these variablias as arguments diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index d52e39f31c7..79e0a1d3740 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -222,12 +222,15 @@ bool validation( uint32_t get_dram_bandwidth(tt::ARCH arch) { constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100; constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384; + constexpr uint32_t BH_DRAM_BANDWIDTH_GB_PER_SEC = 512; uint32_t dram_bandwidth_gb_per_sec = 0; if (arch == tt::ARCH::WORMHOLE_B0) { dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC; } else if (arch == tt::ARCH::GRAYSKULL) { dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC; + } else if (arch == tt::ARCH::BLACKHOLE) { + dram_bandwidth_gb_per_sec = BH_DRAM_BANDWIDTH_GB_PER_SEC; } return dram_bandwidth_gb_per_sec; } @@ -327,10 +330,11 @@ void get_dram_reader_core_coords_grayskull( all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_wormhole_b0( +void get_dram_reader_core_coords( tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for wh_b0 - uint32_t full_grid_size_y = 12; + + uint32_t full_grid_size_x = device->grid_size().x; + uint32_t full_grid_size_y = device->grid_size().y; uint32_t x_step = 3; // get all the logical coord @@ -624,7 +628,7 @@ int main(int argc, char **argv) { tt_metal::Device *device = tt_metal::CreateDevice(device_id); dram_bandwidth_spec = get_dram_bandwidth(device->arch()); - TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0"); + TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0 or device->arch() == ARCH::BLACKHOLE, "device must be wh_b0 or bh"); int clock_freq_mhz = get_tt_npu_clock(device); @@ -634,10 +638,10 @@ int main(int argc, char 
**argv) { CoreRangeSet all_cores; std::vector all_cores_list; - if (device->arch() == tt::ARCH::WORMHOLE_B0) { - get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list); - } else { + if (device->arch() == tt::ARCH::GRAYSKULL) { get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list); + } else { + get_dram_reader_core_coords(device, all_cores, all_cores_list); } uint32_t num_tiles_per_core = num_tiles / num_cores;