Skip to content

Commit

Permalink
#5174: Uplifitng microbenchmarks to run on BH
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Nov 16, 2024
1 parent dd03632 commit c9996d5
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 13 deletions.
8 changes: 4 additions & 4 deletions tests/scripts/run_moreh_microbenchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ run_profiling_test() {
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_h2d_l1 -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_d2h_l1 -k $ARCH_NAME
# pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_noc -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME # how to set r and c for this
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME # how to set r and c for this

if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
if [[ "$ARCH_NAME" != "grayskull" ]]; then
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_all_core -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
Expand Down
72 changes: 71 additions & 1 deletion tests/scripts/test_moreh_microbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,7 @@ def test_pcie_d2h_dram(iteration, test_vector_small, test_vector_large):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
],
)
def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
Expand All @@ -553,6 +554,7 @@ def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536])),
],
)
def test_pcie_d2h_l1(arch, iteration, L1_size, test_vector):
Expand Down Expand Up @@ -649,6 +651,16 @@ def test_matmul_dram(arch, freq, r, c, test_vector):
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
# ########################### 512 512 512 x 8 subblock 4 2 ################################
("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 0, 0, 0, 0, 717089.0),
("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 0, 0, 0, 0, 1233930.0),
("blackhole", 800, np.array([[512, 512, 512]]), 0, 0, 1, 8, 1, 0, 0, 0, 664492.0),
("blackhole", 800, np.array([[512, 512, 512]]), 0, 1, 1, 8, 1, 0, 0, 0, 1173029.0),
# ########################### 512 512 256x8 subblock 4 2 ################################
("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 0, 0, 0, 0, 399068.0),
("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("blackhole", 800, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("blackhole", 800, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
],
)
def test_matmul_single_core_sharded(
Expand Down Expand Up @@ -716,9 +728,12 @@ def test_matmul_single_core_sharded(
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0), # Padded FF1 shapes for llama 70b on TG
("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 0, 8, 0),
("blackhole", 800, np.array([32768, 8 * 128]), 1, 8, 1, 8, 0),
("blackhole", 800, np.array([2048, 3840]), 1, 4, 1, 8, 0), # Padded FF1 shapes for llama 70b on TG
],
)
def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
data = []
cycle_list = []
time_list = []
Expand Down Expand Up @@ -770,6 +785,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
0,
240,
), # 244 GB/s
(
"blackhole",
np.array([2048, 3840]),
1,
16,
0,
8,
0,
240,
), # 244 GB/s
# FF2 shapes for TG llama 70b
(
"wormhole_b0",
Expand All @@ -781,6 +806,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
0,
250,
), # 255 GB/s
(
"blackhole",
np.array([3584, 2304]),
1,
28,
1,
8,
0,
250,
), # 255 GB/s
# Dense Out shapes for TG llama 70b
(
"wormhole_b0",
Expand All @@ -792,6 +827,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
0,
220,
), # 226 GB/s
(
"blackhole",
np.array([1024, 2304]),
1,
8,
1,
8,
0,
220,
), # 226 GB/s
# QKV shapes for TG llama 70b
(
"wormhole_b0",
Expand All @@ -803,6 +848,16 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
0,
225,
), # 232 GB/s
(
"blackhole",
np.array([2048, 1536]),
1,
16,
1,
8,
0,
225,
), # 232 GB/ss
],
)
def test_dram_read_l1_write_core(
Expand Down Expand Up @@ -866,6 +921,20 @@ def test_dram_read_l1_write_core(
("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1),
# # multi layer multi receiver test
("wormhole_b0", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15),
# single layer single receiver test
("blackhole", None, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
# single layer multi receiver test
("blackhole", None, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("blackhole", None, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
# Matmul test does not support mixed data format, just test for either bfp8 or fp16
# single layer single receiver test
("blackhole", "Matmul", np.array([32, 4096, 128]), 1, 8, 10, 256, 0, 1, 1),
("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 1, 1),
# # single layer multi receiver test
("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 1),
# # multi layer multi receiver test
("blackhole", "Matmul", np.array([32, 2048, 128]), 1, 8, 10, 256, 1, 2, 15),
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -944,6 +1013,7 @@ def test_dram_read_remote_cb_sync(
[
("grayskull", 1020, 9, 12, np.array([[3456, 3072, 1024], [2304, 3072, 768]]), np.array([[2304, 3072, 768]])),
("wormhole_b0", 1000, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])),
("blackhole", 800, 6, 6, np.array([[2304, 1536, 1024], [1536, 1536, 768]]), np.array([[1536, 1536, 768]])),
],
)
def test_matmul_l1(arch, freq, r, c, test_vector_global, test_vector_local):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -684,17 +684,21 @@ int main(int argc, char** argv) {
uint32_t get_l1_size(tt::ARCH arch) {
constexpr uint32_t GS_L1_SIZE = 1048576;
constexpr uint32_t WH_L1_SIZE = 1499136;
constexpr uint32_t BH_L1_SIZE = 1499136;

uint32_t l1_size = 0;
if (arch == tt::ARCH::WORMHOLE_B0) {
l1_size = WH_L1_SIZE;
} else if (arch == tt::ARCH::GRAYSKULL) {
l1_size = GS_L1_SIZE;
} else if (arch == tt::ARCH::BLACKHOLE) {
l1_size = BH_L1_SIZE;
}
return l1_size;
}

double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) {
constexpr double BH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.97;
constexpr double WH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.05;
constexpr double GS_FPU_BFP8_TFLOPS_PER_TENSIX = 0.58;

Expand All @@ -707,6 +711,9 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_cl
} else if (arch == tt::ARCH::GRAYSKULL) {
rpeak_tflops =
GS_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
} else if (arch == tt::ARCH::BLACKHOLE) {
rpeak_tflops =
BH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
}

log_debug(LogTest, "Rpeak {} TFLOPS", rpeak_tflops);
Expand Down Expand Up @@ -777,7 +784,7 @@ CoreCoord get_core_range(
std::tuple<MathFidelity, bool> get_compute_params(tt::ARCH arch) {
MathFidelity math_fidelity = MathFidelity::HiFi4;
bool fp32_dest_acc_en = false;
if (arch == tt::ARCH::WORMHOLE_B0) {
if (arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE) {
math_fidelity = MathFidelity::HiFi2;
// TODO: apply packer_l1_acc
// TODO: need to consider whether to set these variablias as arguments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,15 @@ bool validation(
uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100;
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;
constexpr uint32_t BH_DRAM_BANDWIDTH_GB_PER_SEC = 512;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::BLACKHOLE) {
dram_bandwidth_gb_per_sec = BH_DRAM_BANDWIDTH_GB_PER_SEC;
}
return dram_bandwidth_gb_per_sec;
}
Expand Down Expand Up @@ -327,10 +330,11 @@ void get_dram_reader_core_coords_grayskull(
all_cores_ordered = adj_core_logical_realloc;
}

void get_dram_reader_core_coords_wormhole_b0(
void get_dram_reader_core_coords(
tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
// hardcoded for wh_b0
uint32_t full_grid_size_y = 12;

uint32_t full_grid_size_x = device->grid_size().x;
uint32_t full_grid_size_y = device->grid_size().y;
uint32_t x_step = 3;

// get all the logical coord
Expand Down Expand Up @@ -624,7 +628,7 @@ int main(int argc, char **argv) {
tt_metal::Device *device = tt_metal::CreateDevice(device_id);
dram_bandwidth_spec = get_dram_bandwidth(device->arch());

TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0");
TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0 or device->arch() == ARCH::BLACKHOLE, "device must be wh_b0 or bh");

int clock_freq_mhz = get_tt_npu_clock(device);

Expand All @@ -634,10 +638,10 @@ int main(int argc, char **argv) {

CoreRangeSet all_cores;
std::vector<CoreCoord> all_cores_list;
if (device->arch() == tt::ARCH::WORMHOLE_B0) {
get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list);
} else {
if (device->arch() == tt::ARCH::GRAYSKULL) {
get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list);
} else {
get_dram_reader_core_coords(device, all_cores, all_cores_list);
}

uint32_t num_tiles_per_core = num_tiles / num_cores;
Expand Down

0 comments on commit c9996d5

Please sign in to comment.