Skip to content

Commit

Permalink
#7823: Keep expected PCC at one place
Browse files Browse the repository at this point in the history
  • Loading branch information
s-jovic committed Jun 21, 2024
1 parent 5904433 commit 4657af0
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 67 deletions.
60 changes: 58 additions & 2 deletions models/demos/falcon7b/tests/run_falcon_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# SPDX-License-Identifier: Apache-2.0

from enum import Enum
import torch
from loguru import logger
import numpy as np
Expand Down Expand Up @@ -47,6 +48,59 @@ def get_inputs_on_device(llm_mode, tt_FalconCausalLM, model_input, kv_cache_len,
return tt_input_ids, tt_attention_mask


class DeviceSetup(Enum):
    """Device setups used as the first-level keys of the expected-PCC tables.

    Members index PREFILL_CONFIG_TO_PCC and DECODE_CONFIG_TO_PCC.
    """

    # Looked up by test_perf_gs_bare_metal.
    GRAYSKULL = 0
    # Looked up by test_device_perf_wh_bare_metal and test_perf_wh_bare_metal.
    WORMHOLE_B0 = 1
    # Looked up by test_perf_t3000_bare_metal.
    T3000 = 2


# Expected PCC values for prefill runs, looked up as
# PREFILL_CONFIG_TO_PCC[device_setup][model_config_str][seq_len]
#   -> (output_pcc, k_cache_pcc, v_cache_pcc)
PREFILL_CONFIG_TO_PCC = {
    DeviceSetup.GRAYSKULL: {
        # Both model config strings carry the same expectations; the
        # comprehension builds a separate inner dict for each key.
        config: {128: (0.85, 0.97, 0.86), 256: (0.90, 0.97, 0.87)}
        for config in ("BFLOAT16-DRAM", "BFLOAT16-L1")
    },
    DeviceSetup.WORMHOLE_B0: {
        "BFLOAT16-DRAM": {
            128: (0.97, 0.99, 0.97),
            256: (0.99, 0.99, 0.97),
            1024: (0.99, 0.99, 0.98),
            2048: (0.99, 0.99, 0.98),
        },
    },
    DeviceSetup.T3000: {
        "BFLOAT16-DRAM": {
            128: (0.98, 0.99, 0.97),
            256: (0.99, 0.99, 0.97),
            1024: (0.99, 0.99, 0.98),
            2048: (0.99, 0.99, 0.98),
        },
    },
}

# Expected PCC values for decode runs, looked up as
# DECODE_CONFIG_TO_PCC[device_setup][model_config_str][kv_cache_len]
#   -> (output_pcc, k_cache_pcc, v_cache_pcc)
DECODE_CONFIG_TO_PCC = {
    DeviceSetup.GRAYSKULL: {
        # Same expectations for both model config strings; one fresh dict per key.
        config: {
            128: (0.63, 0.80, 0.84),
            1024: (0.56, 0.86, 0.88),
            2047: (0.55, 0.91, 0.89),
        }
        for config in ("BFLOAT16-DRAM", "BFLOAT16-L1")
    },
    DeviceSetup.WORMHOLE_B0: {
        # DRAM and L1 share values; the sharded config has its own entries.
        **{
            config: {
                128: (0.91, 0.92, 0.93),
                1024: (0.86, 0.92, 0.92),
                2047: (0.88, 0.93, 0.93),
            }
            for config in ("BFLOAT16-DRAM", "BFLOAT16-L1")
        },
        "BFLOAT16-L1_SHARDED": {
            128: (0.92, 0.95, 0.95),
            1024: (0.87, 0.94, 0.94),
            2047: (0.88, 0.92, 0.93),
        },
    },
    DeviceSetup.T3000: {
        "BFLOAT16-L1_SHARDED": {
            128: (0.89, 0.94, 0.94),
            1024: (0.86, 0.90, 0.91),
            2047: (0.77, 0.69, 0.72),
        },
    },
}


def run_test_FalconCausalLM_end_to_end(
devices,
model_version,
Expand All @@ -60,12 +114,14 @@ def run_test_FalconCausalLM_end_to_end(
model_config_str,
tt_cache_path,
model_location_generator,
expected_inference_time,
async_mode=False,
e2e_perf=False,
expected_inference_time=None,
device_perf=False,
async_mode=False,
):
assert not (e2e_perf and device_perf), "Cannot run both e2e and device perf test at the same time"
if e2e_perf:
assert expected_inference_time is not None, "Expected inference time is required for e2e perf test"

# Clear global profiler state before starting measurements
if e2e_perf:
Expand Down
29 changes: 19 additions & 10 deletions models/demos/falcon7b/tests/test_falcon_device_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import pytest

from models.demos.falcon7b.tests.run_falcon_end_to_end import run_test_FalconCausalLM_end_to_end
from models.demos.falcon7b.tests.run_falcon_end_to_end import (
DECODE_CONFIG_TO_PCC,
PREFILL_CONFIG_TO_PCC,
DeviceSetup,
run_test_FalconCausalLM_end_to_end,
)
from models.demos.falcon7b.tt.model_config import get_model_config
from models.perf.device_perf_utils import check_device_perf, prep_device_perf_report, run_device_perf
from models.utility_functions import disable_compilation_reports, disable_persistent_kernel_cache
Expand All @@ -16,11 +21,11 @@
ids=["falcon_7b"],
)
@pytest.mark.parametrize(
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str",
(
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.97, 0.1),
("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5),
("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1),
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM"),
("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM"),
("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM"),
),
ids=[
"prefill_seq128_bfloat16-dram",
Expand All @@ -34,11 +39,7 @@ def test_device_perf_wh_bare_metal(
batch,
seq_len,
kv_cache_len,
expected_inference_time,
num_layers,
expected_output_pcc,
expected_k_cache_pcc,
expected_v_cache_pcc,
model_config_str,
model_location_generator,
get_tt_cache_path,
Expand All @@ -52,6 +53,15 @@ def test_device_perf_wh_bare_metal(
disable_persistent_kernel_cache()
disable_compilation_reports()

if llm_mode == "prefill":
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
DeviceSetup.WORMHOLE_B0
][model_config_str][seq_len]
else:
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[DeviceSetup.WORMHOLE_B0][
model_config_str
][kv_cache_len]

run_test_FalconCausalLM_end_to_end(
[device],
model_version,
Expand All @@ -65,7 +75,6 @@ def test_device_perf_wh_bare_metal(
model_config_str,
tt_cache_path,
model_location_generator,
expected_inference_time,
device_perf=True,
)

Expand Down
131 changes: 76 additions & 55 deletions models/demos/falcon7b/tests/test_perf_falcon.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import pytest

from models.demos.falcon7b.tests.run_falcon_end_to_end import run_test_FalconCausalLM_end_to_end
from models.demos.falcon7b.tests.run_falcon_end_to_end import (
DECODE_CONFIG_TO_PCC,
PREFILL_CONFIG_TO_PCC,
DeviceSetup,
run_test_FalconCausalLM_end_to_end,
)
from models.demos.falcon7b.tt.model_config import (
get_model_config,
)
Expand All @@ -27,18 +32,18 @@
class TestParametrized:
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time",
(
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.85, 0.97, 0.86, 0.31),
("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.85, 0.97, 0.86, 0.29),
("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.87, 0.43),
("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.87, 0.34),
("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.63, 0.80, 0.84, 0.28),
("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.63, 0.80, 0.84, 0.28),
("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.56, 0.86, 0.88, 0.37),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.56, 0.86, 0.88, 0.31),
("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.55, 0.91, 0.89, 0.40),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.55, 0.91, 0.89, 0.35),
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.31),
("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.29),
("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.43),
("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.34),
("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.28),
("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.28),
("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.37),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.31),
("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.40),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.35),
),
ids=[
"prefill_seq128_bf16_dram",
Expand All @@ -63,10 +68,6 @@ def test_perf_gs_bare_metal(
kv_cache_len,
expected_inference_time,
num_layers,
expected_output_pcc,
expected_k_cache_pcc,
expected_v_cache_pcc,
request,
model_config_str,
model_location_generator,
get_tt_cache_path,
Expand All @@ -87,6 +88,15 @@ def test_perf_gs_bare_metal(
disable_persistent_kernel_cache()
disable_compilation_reports()

if llm_mode == "prefill":
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
DeviceSetup.GRAYSKULL
][model_config_str][seq_len]
else:
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[
DeviceSetup.GRAYSKULL
][model_config_str][kv_cache_len]

run_test_FalconCausalLM_end_to_end(
[device],
model_version,
Expand All @@ -100,8 +110,8 @@ def test_perf_gs_bare_metal(
model_config_str,
tt_cache_path,
model_location_generator,
expected_inference_time,
e2e_perf=True,
expected_inference_time=expected_inference_time,
)

def run_perf_wh_bare_metal(
Expand Down Expand Up @@ -151,27 +161,27 @@ def run_perf_wh_bare_metal(
model_config_str,
tt_cache_path,
model_location_generator,
expected_inference_time,
async_mode,
e2e_perf=True,
expected_inference_time=expected_inference_time,
async_mode=async_mode,
)

@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
"llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time",
(
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.97, 0.1),
("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5),
("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1),
("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.91, 0.92, 0.93, 0.15),
("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.91, 0.92, 0.93, 0.15),
("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.92, 0.95, 0.95, 0.1),
("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.86, 0.92, 0.92, 0.4),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.86, 0.92, 0.92, 0.35),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.94, 0.94, 0.1),
("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.88, 0.93, 0.93, 0.75),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.88, 0.93, 0.93, 0.6),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.88, 0.92, 0.93, 0.11),
("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.1),
("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5),
("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1),
("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.15),
("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.15),
("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.1),
("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.4),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.35),
("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.1),
("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.75),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.6),
("decode", 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.11),
),
ids=[
"prefill_seq128_bf16_dram",
Expand Down Expand Up @@ -199,10 +209,6 @@ def test_perf_wh_bare_metal(
kv_cache_len,
expected_inference_time,
num_layers,
expected_output_pcc,
expected_k_cache_pcc,
expected_v_cache_pcc,
request,
model_config_str,
model_location_generator,
get_tt_cache_path,
Expand All @@ -215,6 +221,16 @@ def test_perf_wh_bare_metal(
pytest.skip(
f"Skipping {llm_mode} with {kv_cache_len} in async mode. Config is supported but provides redundant testing."
)

if llm_mode == "prefill":
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
DeviceSetup.WORMHOLE_B0
][model_config_str][seq_len]
else:
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[
DeviceSetup.WORMHOLE_B0
][model_config_str][kv_cache_len]

self.run_perf_wh_bare_metal(
model_version,
1,
Expand All @@ -234,22 +250,22 @@ def test_perf_wh_bare_metal(

@pytest.mark.model_perf_t3000
@pytest.mark.parametrize(
"llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time, async_mode",
"llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time, async_mode",
(
("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.97, 0.1, False),
("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.97, 0.18, False),
("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5, False),
("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1, False),
("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.89, 0.94, 0.94, 0.09, False),
("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.86, 0.90, 0.91, 0.09, False),
("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.77, 0.69, 0.72, 0.1, False),
("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.97, 0.11, True), # Issue 9422
("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.97, 0.18, True),
("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5, True),
("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1, True),
("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.89, 0.94, 0.94, 0.09, True),
("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.86, 0.90, 0.91, 0.09, True),
("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.77, 0.69, 0.72, 0.09, True),
("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.1, False),
("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.18, False),
("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5, False),
("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1, False),
("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.09, False),
("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.09, False),
("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.1, False),
("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.11, True), # Issue 9422
("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.18, True),
("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5, True),
("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1, True),
("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.09, True),
("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.09, True),
("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.09, True),
),
ids=[
"prefill_seq128",
Expand Down Expand Up @@ -281,15 +297,20 @@ def test_perf_t3000_bare_metal(
expected_inference_time,
async_mode,
num_layers,
expected_output_pcc,
expected_k_cache_pcc,
expected_v_cache_pcc,
request,
model_config_str,
model_location_generator,
get_tt_cache_path,
all_devices,
):
if llm_mode == "prefill":
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[DeviceSetup.T3000][
model_config_str
][seq_len]
else:
expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[DeviceSetup.T3000][
model_config_str
][kv_cache_len]

self.run_perf_wh_bare_metal(
model_version,
num_devices,
Expand Down

0 comments on commit 4657af0

Please sign in to comment.