#7823: Keep expected PCC in one place

tenstorrent · Jun 21, 2024 · 4657af0 · 4657af0
1 parent 5904433
commit 4657af0
Show file tree

Hide file tree

Showing 3 changed files with 153 additions and 67 deletions.
diff --git a/models/demos/falcon7b/tests/run_falcon_end_to_end.py b/models/demos/falcon7b/tests/run_falcon_end_to_end.py
@@ -2,6 +2,7 @@
 
 # SPDX-License-Identifier: Apache-2.0
 
+from enum import Enum
 import torch
 from loguru import logger
 import numpy as np
@@ -47,6 +48,59 @@ def get_inputs_on_device(llm_mode, tt_FalconCausalLM, model_input, kv_cache_len,
     return tt_input_ids, tt_attention_mask
 
 
+class DeviceSetup(Enum):
+    GRAYSKULL = 0
+    WORMHOLE_B0 = 1
+    T3000 = 2
+
+
+# PREFILL_CONFIG_TO_PCC[device_setup][model_config_str][seq_len] = (output_pcc, k_cache_pcc, v_cache_pcc)
+PREFILL_CONFIG_TO_PCC = {
+    DeviceSetup.GRAYSKULL: {
+        "BFLOAT16-DRAM": {
+            128: (0.85, 0.97, 0.86),
+            256: (0.90, 0.97, 0.87),
+        },
+        "BFLOAT16-L1": {
+            128: (0.85, 0.97, 0.86),
+            256: (0.90, 0.97, 0.87),
+        },
+    },
+    DeviceSetup.WORMHOLE_B0: {
+        "BFLOAT16-DRAM": {
+            128: (0.97, 0.99, 0.97),
+            256: (0.99, 0.99, 0.97),
+            1024: (0.99, 0.99, 0.98),
+            2048: (0.99, 0.99, 0.98),
+        }
+    },
+    DeviceSetup.T3000: {
+        "BFLOAT16-DRAM": {
+            128: (0.98, 0.99, 0.97),
+            256: (0.99, 0.99, 0.97),
+            1024: (0.99, 0.99, 0.98),
+            2048: (0.99, 0.99, 0.98),
+        }
+    },
+}
+
+# DECODE_CONFIG_TO_PCC[device_setup][model_config_str][kv_cache_len] = (output_pcc, k_cache_pcc, v_cache_pcc)
+DECODE_CONFIG_TO_PCC = {
+    DeviceSetup.GRAYSKULL: {
+        "BFLOAT16-DRAM": {128: (0.63, 0.80, 0.84), 1024: (0.56, 0.86, 0.88), 2047: (0.55, 0.91, 0.89)},
+        "BFLOAT16-L1": {128: (0.63, 0.80, 0.84), 1024: (0.56, 0.86, 0.88), 2047: (0.55, 0.91, 0.89)},
+    },
+    DeviceSetup.WORMHOLE_B0: {
+        "BFLOAT16-DRAM": {128: (0.91, 0.92, 0.93), 1024: (0.86, 0.92, 0.92), 2047: (0.88, 0.93, 0.93)},
+        "BFLOAT16-L1": {128: (0.91, 0.92, 0.93), 1024: (0.86, 0.92, 0.92), 2047: (0.88, 0.93, 0.93)},
+        "BFLOAT16-L1_SHARDED": {128: (0.92, 0.95, 0.95), 1024: (0.87, 0.94, 0.94), 2047: (0.88, 0.92, 0.93)},
+    },
+    DeviceSetup.T3000: {
+        "BFLOAT16-L1_SHARDED": {128: (0.89, 0.94, 0.94), 1024: (0.86, 0.90, 0.91), 2047: (0.77, 0.69, 0.72)}
+    },
+}
+
+
 def run_test_FalconCausalLM_end_to_end(
     devices,
     model_version,
@@ -60,12 +114,14 @@ def run_test_FalconCausalLM_end_to_end(
     model_config_str,
     tt_cache_path,
     model_location_generator,
-    expected_inference_time,
-    async_mode=False,
     e2e_perf=False,
+    expected_inference_time=None,
     device_perf=False,
+    async_mode=False,
 ):
     assert not (e2e_perf and device_perf), "Cannot run both e2e and device perf test at the same time"
+    if e2e_perf:
+        assert expected_inference_time is not None, "Expected inference time is required for e2e perf test"
 
     # Clear global profiler state before starting measurements
     if e2e_perf:

diff --git a/models/demos/falcon7b/tests/test_falcon_device_perf.py b/models/demos/falcon7b/tests/test_falcon_device_perf.py
@@ -4,7 +4,12 @@
 
 import pytest
 
-from models.demos.falcon7b.tests.run_falcon_end_to_end import run_test_FalconCausalLM_end_to_end
+from models.demos.falcon7b.tests.run_falcon_end_to_end import (
+    DECODE_CONFIG_TO_PCC,
+    PREFILL_CONFIG_TO_PCC,
+    DeviceSetup,
+    run_test_FalconCausalLM_end_to_end,
+)
 from models.demos.falcon7b.tt.model_config import get_model_config
 from models.perf.device_perf_utils import check_device_perf, prep_device_perf_report, run_device_perf
 from models.utility_functions import disable_compilation_reports, disable_persistent_kernel_cache
@@ -16,11 +21,11 @@
     ids=["falcon_7b"],
 )
 @pytest.mark.parametrize(
-    "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
+    "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str",
     (
-        ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.97, 0.1),
-        ("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5),
-        ("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1),
+        ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM"),
+        ("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM"),
+        ("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM"),
     ),
     ids=[
         "prefill_seq128_bfloat16-dram",
@@ -34,11 +39,7 @@ def test_device_perf_wh_bare_metal(
     batch,
     seq_len,
     kv_cache_len,
-    expected_inference_time,
     num_layers,
-    expected_output_pcc,
-    expected_k_cache_pcc,
-    expected_v_cache_pcc,
     model_config_str,
     model_location_generator,
     get_tt_cache_path,
@@ -52,6 +53,15 @@ def test_device_perf_wh_bare_metal(
     disable_persistent_kernel_cache()
     disable_compilation_reports()
 
+    if llm_mode == "prefill":
+        expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
+            DeviceSetup.WORMHOLE_B0
+        ][model_config_str][seq_len]
+    else:
+        expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[DeviceSetup.WORMHOLE_B0][
+            model_config_str
+        ][kv_cache_len]
+
     run_test_FalconCausalLM_end_to_end(
         [device],
         model_version,
@@ -65,7 +75,6 @@ def test_device_perf_wh_bare_metal(
         model_config_str,
         tt_cache_path,
         model_location_generator,
-        expected_inference_time,
         device_perf=True,
     )
 

diff --git a/models/demos/falcon7b/tests/test_perf_falcon.py b/models/demos/falcon7b/tests/test_perf_falcon.py
@@ -4,7 +4,12 @@
 
 import pytest
 
-from models.demos.falcon7b.tests.run_falcon_end_to_end import run_test_FalconCausalLM_end_to_end
+from models.demos.falcon7b.tests.run_falcon_end_to_end import (
+    DECODE_CONFIG_TO_PCC,
+    PREFILL_CONFIG_TO_PCC,
+    DeviceSetup,
+    run_test_FalconCausalLM_end_to_end,
+)
 from models.demos.falcon7b.tt.model_config import (
     get_model_config,
 )
@@ -27,18 +32,18 @@
 class TestParametrized:
     @pytest.mark.models_performance_bare_metal
     @pytest.mark.parametrize(
-        "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
+        "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time",
         (
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.85, 0.97, 0.86, 0.31),
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.85, 0.97, 0.86, 0.29),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.90, 0.97, 0.87, 0.43),
-            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.90, 0.97, 0.87, 0.34),
-            ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.63, 0.80, 0.84, 0.28),
-            ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.63, 0.80, 0.84, 0.28),
-            ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.56, 0.86, 0.88, 0.37),
-            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.56, 0.86, 0.88, 0.31),
-            ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.55, 0.91, 0.89, 0.40),
-            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.55, 0.91, 0.89, 0.35),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.31),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-L1", 0.29),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-DRAM", 0.43),
+            ("prefill", 32, 1, 256, 0, "BFLOAT16-L1", 0.34),
+            ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.28),
+            ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.28),
+            ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.37),
+            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.31),
+            ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.40),
+            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.35),
         ),
         ids=[
             "prefill_seq128_bf16_dram",
@@ -63,10 +68,6 @@ def test_perf_gs_bare_metal(
         kv_cache_len,
         expected_inference_time,
         num_layers,
-        expected_output_pcc,
-        expected_k_cache_pcc,
-        expected_v_cache_pcc,
-        request,
         model_config_str,
         model_location_generator,
         get_tt_cache_path,
@@ -87,6 +88,15 @@ def test_perf_gs_bare_metal(
         disable_persistent_kernel_cache()
         disable_compilation_reports()
 
+        if llm_mode == "prefill":
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
+                DeviceSetup.GRAYSKULL
+            ][model_config_str][seq_len]
+        else:
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[
+                DeviceSetup.GRAYSKULL
+            ][model_config_str][kv_cache_len]
+
         run_test_FalconCausalLM_end_to_end(
             [device],
             model_version,
@@ -100,8 +110,8 @@ def test_perf_gs_bare_metal(
             model_config_str,
             tt_cache_path,
             model_location_generator,
-            expected_inference_time,
             e2e_perf=True,
+            expected_inference_time=expected_inference_time,
         )
 
     def run_perf_wh_bare_metal(
@@ -151,27 +161,27 @@ def run_perf_wh_bare_metal(
             model_config_str,
             tt_cache_path,
             model_location_generator,
-            expected_inference_time,
-            async_mode,
             e2e_perf=True,
+            expected_inference_time=expected_inference_time,
+            async_mode=async_mode,
         )
 
     @pytest.mark.models_performance_bare_metal
     @pytest.mark.parametrize(
-        "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time",
+        "llm_mode, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time",
         (
-            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.97, 0.99, 0.97, 0.1),
-            ("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5),
-            ("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1),
-            ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.91, 0.92, 0.93, 0.15),
-            ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.91, 0.92, 0.93, 0.15),
-            ("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.92, 0.95, 0.95, 0.1),
-            ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.86, 0.92, 0.92, 0.4),
-            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.86, 0.92, 0.92, 0.35),
-            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.87, 0.94, 0.94, 0.1),
-            ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.88, 0.93, 0.93, 0.75),
-            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.88, 0.93, 0.93, 0.6),
-            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.88, 0.92, 0.93, 0.11),
+            ("prefill", 32, 1, 128, 0, "BFLOAT16-DRAM", 0.1),
+            ("prefill", 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5),
+            ("prefill", 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1),
+            ("decode", 32, 32, 1, 128, "BFLOAT16-DRAM", 0.15),
+            ("decode", 32, 32, 1, 128, "BFLOAT16-L1", 0.15),
+            ("decode", 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.1),
+            ("decode", 32, 32, 1, 1024, "BFLOAT16-DRAM", 0.4),
+            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1", 0.35),
+            ("decode", 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.1),
+            ("decode", 32, 32, 1, 2047, "BFLOAT16-DRAM", 0.75),
+            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1", 0.6),
+            ("decode", 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.11),
         ),
         ids=[
             "prefill_seq128_bf16_dram",
@@ -199,10 +209,6 @@ def test_perf_wh_bare_metal(
         kv_cache_len,
         expected_inference_time,
         num_layers,
-        expected_output_pcc,
-        expected_k_cache_pcc,
-        expected_v_cache_pcc,
-        request,
         model_config_str,
         model_location_generator,
         get_tt_cache_path,
@@ -215,6 +221,16 @@ def test_perf_wh_bare_metal(
                 pytest.skip(
                     f"Skipping {llm_mode} with {kv_cache_len} in async mode. Config is supported but provides redundant testing."
                 )
+
+        if llm_mode == "prefill":
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[
+                DeviceSetup.WORMHOLE_B0
+            ][model_config_str][seq_len]
+        else:
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[
+                DeviceSetup.WORMHOLE_B0
+            ][model_config_str][kv_cache_len]
+
         self.run_perf_wh_bare_metal(
             model_version,
             1,
@@ -234,22 +250,22 @@ def test_perf_wh_bare_metal(
 
     @pytest.mark.model_perf_t3000
     @pytest.mark.parametrize(
-        "llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time, async_mode",
+        "llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_inference_time, async_mode",
         (
-            ("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.97, 0.1, False),
-            ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.97, 0.18, False),
-            ("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5, False),
-            ("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1, False),
-            ("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.89, 0.94, 0.94, 0.09, False),
-            ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.86, 0.90, 0.91, 0.09, False),
-            ("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.77, 0.69, 0.72, 0.1, False),
-            ("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.98, 0.99, 0.97, 0.11, True),  # Issue 9422
-            ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.97, 0.18, True),
-            ("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 0.5, True),
-            ("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 0.99, 0.99, 0.98, 1.1, True),
-            ("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.89, 0.94, 0.94, 0.09, True),
-            ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.86, 0.90, 0.91, 0.09, True),
-            ("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.77, 0.69, 0.72, 0.09, True),
+            ("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.1, False),
+            ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.18, False),
+            ("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5, False),
+            ("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1, False),
+            ("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.09, False),
+            ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.09, False),
+            ("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.1, False),
+            ("prefill", 4, 32, 1, 128, 0, "BFLOAT16-DRAM", 0.11, True),  # Issue 9422
+            ("prefill", 4, 32, 1, 256, 0, "BFLOAT16-DRAM", 0.18, True),
+            ("prefill", 4, 32, 1, 1024, 0, "BFLOAT16-DRAM", 0.5, True),
+            ("prefill", 4, 32, 1, 2048, 0, "BFLOAT16-DRAM", 1.1, True),
+            ("decode", 4, 32, 32, 1, 128, "BFLOAT16-L1_SHARDED", 0.09, True),
+            ("decode", 4, 32, 32, 1, 1024, "BFLOAT16-L1_SHARDED", 0.09, True),
+            ("decode", 4, 32, 32, 1, 2047, "BFLOAT16-L1_SHARDED", 0.09, True),
         ),
         ids=[
             "prefill_seq128",
@@ -281,15 +297,20 @@ def test_perf_t3000_bare_metal(
         expected_inference_time,
         async_mode,
         num_layers,
-        expected_output_pcc,
-        expected_k_cache_pcc,
-        expected_v_cache_pcc,
-        request,
         model_config_str,
         model_location_generator,
         get_tt_cache_path,
         all_devices,
     ):
+        if llm_mode == "prefill":
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = PREFILL_CONFIG_TO_PCC[DeviceSetup.T3000][
+                model_config_str
+            ][seq_len]
+        else:
+            expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc = DECODE_CONFIG_TO_PCC[DeviceSetup.T3000][
+                model_config_str
+            ][kv_cache_len]
+
         self.run_perf_wh_bare_metal(
             model_version,
             num_devices,