From c9996d5949149e769f66731d06934bc2c0e9022e Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 24 Oct 2024 00:31:20 +0000 Subject: [PATCH] #5174: Uplifting microbenchmarks to run on BH --- tests/scripts/run_moreh_microbenchmark.sh | 8 +-- tests/scripts/test_moreh_microbenchmark.py | 72 ++++++++++++++++++- .../1_compute_mm/test_compute_mm.cpp | 9 ++- .../test_dram_read.cpp | 18 +++-- 4 files changed, 94 insertions(+), 13 deletions(-) diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh index 2b7107bb7df..f2a085fa2c7 100755 --- a/tests/scripts/run_moreh_microbenchmark.sh +++ b/tests/scripts/run_moreh_microbenchmark.sh @@ -29,12 +29,12 @@ run_profiling_test() { pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_h2d_l1 -k $ARCH_NAME pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_d2h_l1 -k $ARCH_NAME # pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_noc -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME + pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME # how to set r and c for this + pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME # how to set r and c for this - if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then + if [[ "$ARCH_NAME" != "grayskull" ]]; then pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME - pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME + pytest --capture=tee-sys 
$TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_all_core -k $ARCH_NAME pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME fi # bypass wh_b0 for now until we can move FD cores to last col diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py index 6076d6c034e..a6f946eda36 100755 --- a/tests/scripts/test_moreh_microbenchmark.py +++ b/tests/scripts/test_moreh_microbenchmark.py @@ -529,6 +529,7 @@ def test_pcie_d2h_dram(iteration, test_vector_small, test_vector_large): [ ("grayskull", 2, 1048576, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), ("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), + ("blackhole", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])), ], ) def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector): @@ -553,6 +554,7 @@ def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector): [ ("grayskull", 2, 1048576, np.array([4096, 16384, 65536])), ("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536])), + ("blackhole", 2, 1499136, np.array([4096, 16384, 65536])), ], ) def test_pcie_d2h_l1(arch, iteration, L1_size, test_vector): @@ -649,6 +651,16 @@ def test_matmul_dram(arch, freq, r, c, test_vector): ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0), ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0), ("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0), + # ########################### 512 512 512 x 8 subblock 4 2 ################################ + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 0, 0, 0, 0, 717089.0), + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 0, 0, 0, 0, 1233930.0), + ("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 1, 0, 0, 0, 664492.0), + 
("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 1, 0, 0, 0, 1173029.0), + # ########################### 512 512 256x8 subblock 4 2 ################################ + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 0, 0, 0, 0, 399068.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0), + ("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0), ], ) def test_matmul_single_core_sharded( @@ -716,9 +728,12 @@ def test_matmul_single_core_sharded( ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0), ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0), ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0), # Padded FF1 shapes for llama 70b on TG + ("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 0, 8, 0), + ("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 1, 8, 0), + ("blackhole", 800, np.array([2048, 3840]), 1, 4, 1, 8, 0), # Padded FF1 shapes for llama 70b on TG ], ) -def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id): +def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id): data = [] cycle_list = [] time_list = [] @@ -770,6 +785,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 240, ), # 244 GB/s + ( + "blackhole", + np.array([2048, 3840]), + 1, + 16, + 0, + 8, + 0, + 240, + ), # 244 GB/s # FF2 shapes for TG llama 70b ( "wormhole_b0", @@ -781,6 +806,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 250, ), # 255 GB/s + ( + "blackhole", + np.array([3584, 2304]), + 1, + 28, + 1, + 8, + 0, + 250, + ), # 255 GB/s # Dense Out shapes for TG llama 70b ( "wormhole_b0", @@ -792,6 +827,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 
220, ), # 226 GB/s + ( + "blackhole", + np.array([1024, 2304]), + 1, + 8, + 1, + 8, + 0, + 220, + ), # 226 GB/s # QKV shapes for TG llama 70b ( "wormhole_b0", @@ -803,6 +848,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form 0, 225, ), # 232 GB/s + ( + "blackhole", + np.array([2048, 1536]), + 1, + 16, + 1, + 8, + 0, + 225, + ), # 232 GB/s ], ) def test_dram_read_l1_write_core( @@ -866,6 +921,20 @@ def test_dram_read_l1_write_core( ("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1), # # multi layer multi receiver test ("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15), + # single layer single receiver test + ("blackhole", None, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1), + # single layer multi receiver test + ("blackhole", None, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1), + # multi layer multi receiver test + ("blackhole", None, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15), + # Matmul test does not support mixed data format, just test for either bfp8 or fp16 + # single layer single receiver test + ("blackhole", "Matmul", np.array([32, 4096, 128]), 1, 8, 10, 256, 0, 1, 1), + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 1, 1), + # # single layer multi receiver test + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1), + # # multi layer multi receiver test + ("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15), ], ) @pytest.mark.parametrize( @@ -944,6 +1013,7 @@ def test_dram_read_remote_cb_sync( [ ("grayskull", 1020, 9, 12, np.array([[3456, 3072, 1024], [2304, 3072, 768]]), np.array([[2304, 3072, 768]])), ("wormhole_b0", 1000, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])), + ("blackhole", 800, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])), ], ) def test_matmul_l1(arch, freq, r, c, test_vector_global, 
test_vector_local): diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index ced445053f8..d72ac2a08b1 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -684,17 +684,21 @@ int main(int argc, char** argv) { uint32_t get_l1_size(tt::ARCH arch) { constexpr uint32_t GS_L1_SIZE = 1048576; constexpr uint32_t WH_L1_SIZE = 1499136; + constexpr uint32_t BH_L1_SIZE = 1499136; uint32_t l1_size = 0; if (arch == tt::ARCH::WORMHOLE_B0) { l1_size = WH_L1_SIZE; } else if (arch == tt::ARCH::GRAYSKULL) { l1_size = GS_L1_SIZE; + } else if (arch == tt::ARCH::BLACKHOLE) { + l1_size = BH_L1_SIZE; } return l1_size; } double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) { + constexpr double BH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.97; constexpr double WH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.05; constexpr double GS_FPU_BFP8_TFLOPS_PER_TENSIX = 0.58; @@ -707,6 +711,9 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_cl } else if (arch == tt::ARCH::GRAYSKULL) { rpeak_tflops = GS_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast(num_compute_core) * static_cast(clock); + } else if (arch == tt::ARCH::BLACKHOLE) { + rpeak_tflops = + BH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast(num_compute_core) * static_cast(clock); } log_debug(LogTest, "Rpeak {} TFLOPS", rpeak_tflops); @@ -777,7 +784,7 @@ CoreCoord get_core_range( std::tuple get_compute_params(tt::ARCH arch) { MathFidelity math_fidelity = MathFidelity::HiFi4; bool fp32_dest_acc_en = false; - if (arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE) { math_fidelity = MathFidelity::HiFi2; // TODO: apply packer_l1_acc // TODO: need to consider whether to set these variablias as arguments diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index d52e39f31c7..79e0a1d3740 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -222,12 +222,15 @@ bool validation( uint32_t get_dram_bandwidth(tt::ARCH arch) { constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100; constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384; + constexpr uint32_t BH_DRAM_BANDWIDTH_GB_PER_SEC = 512; uint32_t dram_bandwidth_gb_per_sec = 0; if (arch == tt::ARCH::WORMHOLE_B0) { dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC; } else if (arch == tt::ARCH::GRAYSKULL) { dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC; + } else if (arch == tt::ARCH::BLACKHOLE) { + dram_bandwidth_gb_per_sec = BH_DRAM_BANDWIDTH_GB_PER_SEC; } return dram_bandwidth_gb_per_sec; } @@ -327,10 +330,11 @@ void get_dram_reader_core_coords_grayskull( all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_wormhole_b0( +void get_dram_reader_core_coords( tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for wh_b0 - uint32_t full_grid_size_y = 12; + + uint32_t full_grid_size_x = device->grid_size().x; + uint32_t full_grid_size_y = device->grid_size().y; uint32_t x_step = 3; // get all the logical coord @@ -624,7 +628,7 @@ int main(int argc, char **argv) { tt_metal::Device *device = tt_metal::CreateDevice(device_id); dram_bandwidth_spec = get_dram_bandwidth(device->arch()); - TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0"); + TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0 or device->arch() == ARCH::BLACKHOLE, "device must be wh_b0 or bh"); int clock_freq_mhz = get_tt_npu_clock(device); @@ -634,10 +638,10 @@ int main(int argc, char 
**argv) { CoreRangeSet all_cores; std::vector all_cores_list; - if (device->arch() == tt::ARCH::WORMHOLE_B0) { - get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list); - } else { + if (device->arch() == tt::ARCH::GRAYSKULL) { get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list); + } else { + get_dram_reader_core_coords(device, all_cores, all_cores_list); } uint32_t num_tiles_per_core = num_tiles / num_cores;