Merge branch 'main' into blozano/rc_local

tenstorrent · Dec 12, 2024 · 2d40904 · 2d40904
2 parents da6fb63 + 6e983a7
commit 2d40904
Show file tree

Hide file tree

Showing 53 changed files with 958 additions and 500 deletions.
diff --git a/.clang-format b/.clang-format
@@ -95,6 +95,7 @@ PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Left
+QualifierAlignment: Left
 RawStringFormats:
   - Language:        Cpp
     Delimiters:

diff --git a/.clang-tidy b/.clang-tidy
@@ -195,3 +195,5 @@ Checks: >
 CheckOptions:
   - key: readability-function-cognitive-complexity.IgnoreMacros
     value: true
+
+FormatStyle: 'file'
diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml
@@ -24,6 +24,7 @@ on:
       - "(Single-card) Demo tests"
       - "(Single-card) Tests for new models"
       - "Nightly fast dispatch tests"
+      - "(Single-card) Nightly model and ttnn tests"
       - "(Single-card) Tests for new models"
       - "(T3K) T3000 demo tests"
       - "(T3K) T3000 model perf tests"

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
@@ -40,13 +40,6 @@ jobs:
               cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
               timeout: 70
             },
-            {
-              name: "WH N300 pgm dispatch nightly",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N300", "issue-15821"],
-              cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh,
-              timeout: 10
-            },
             {
               name: "GS-only models",
               arch: grayskull,
@@ -151,25 +144,26 @@ jobs:
         test-config:
           - model: "stable_diffusion"
             cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion
-          - model: "mamba 1"
-            cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1
-          - model: "mamba 2"
-            cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2
-          - model: "mamba 3"
-            cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3
-          - model: "mamba 4"
-            cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4
+              # Skipping due to issue #15932
+              # - model: "mamba 1"
+              # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1
+              # - model: "mamba 2"
+              # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2
+              # - model: "mamba 3"
+              # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3
+              # - model: "mamba 4"
+              # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4
           - model: "mamba 5"
             cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 5
-          - model: "mamba 6"
-            cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6
+              # - model: "mamba 6"
+              # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6
         card: [N150, N300]
     name: "[Unstable] Nightly ${{ matrix.card }} ${{ matrix.test-config.model }}"
     env:
       ARCH_NAME: wormhole_b0
       LOGURU_LEVEL: INFO
       LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
-    runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
+    runs-on: ["cloud-virtual-machine", "issue-15821", "${{ matrix.card }}"]
     steps:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
       - uses: ./.github/actions/retry-command

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
@@ -1,4 +1,4 @@
-name: Nightly fast dispatch tests
+name: "(Single-card) Nightly model and ttnn tests"
 
 on:
   workflow_dispatch:

diff --git a/INSTALLING.md b/INSTALLING.md
@@ -20,9 +20,9 @@ Note the current compatibility matrix:
 
 | Device              | OS              | Python   | Driver (TT-KMD)    | Firmware (TT-Flash)                        | TT-SMI                | TT-Topology                    |
 |---------------------|-----------------|----------|--------------------|--------------------------------------------|-----------------------|--------------------------------|
-| Grayskull           | Ubuntu 20.04    | 3.8.10   | v1.27.1            | fw_pack-80.9.0.0 (v80.9.0.0)               | v2.2.0 or above       | N/A                            |
-| Wormhole            | Ubuntu 20.04    | 3.8.10   | v1.27.1            | fw_pack-80.10.0.0 (v80.10.0.0)             | v2.2.0 or above       | N/A                            |
-| T3000 (Wormhole)    | Ubuntu 20.04    | 3.8.10   | v1.27.1            | fw_pack-80.10.0.0 (v80.10.0.0)             | v2.2.0 or above       | v1.1.3 or above, `mesh` config |
+| Grayskull           | Ubuntu 20.04    | 3.8.10   | v1.29              | fw_pack-80.9.0.0 (v80.9.0.0)               | v2.2.0 or above       | N/A                            |
+| Wormhole            | Ubuntu 20.04    | 3.8.10   | v1.29              | fw_pack-80.13.0.0 (v80.13.0.0)             | v2.2.0 or above       | N/A                            |
+| T3000 (Wormhole)    | Ubuntu 20.04    | 3.8.10   | v1.29              | fw_pack-80.13.0.0 (v80.13.0.0)             | v2.2.0 or above       | v1.1.3 or above, `mesh` config |
 
 ---
 

diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py
@@ -22,7 +22,7 @@
 
 
 def get_expected_times(vgg):
-    return (16, 10.5)
+    return (17, 10.5)
 
 
 @pytest.mark.models_performance_bare_metal

diff --git a/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py
@@ -72,6 +72,7 @@ def unsqueeze_all_params_to_4d(params):
         (2, 4, 64, 64),
     ],
 )
+@pytest.mark.skip(reason="#15931: Failing, skip for now")
 def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_height, input_width):
     device.enable_program_cache()
 

diff --git a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py
@@ -29,6 +29,7 @@
 @pytest.mark.parametrize("res_hidden_states_tuple", [([2, 1280, 8, 8], [2, 1280, 8, 8], [2, 1280, 8, 8])])
 @pytest.mark.parametrize("hidden_states", [[2, 1280, 8, 8]])
 @pytest.mark.parametrize("temb", [[1, 1, 2, 1280]])
+@pytest.mark.skip(reason="#15931: Fails, need to investigate")
 def test_upblock_512x512(reset_seeds, device, res_hidden_states_tuple, hidden_states, temb):
     # TODO
     # setup pytorch model

diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py
@@ -23,7 +23,7 @@
 
 
 def get_expected_times():
-    return (40, 16)
+    return (40, 16.2)
 
 
 @pytest.mark.models_performance_bare_metal
@@ -96,7 +96,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name):
     num_iterations = 1
     margin = 0.03
 
-    expected_perf = 197.89
+    expected_perf = 199.89
     command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py"
 
     cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]

diff --git a/models/utility_functions.py b/models/utility_functions.py
@@ -15,6 +15,8 @@
 
 from ttnn.device import Arch
 
+from typing_extensions import deprecated
+
 
 ### Math operations ###
 def _nearest_32(x):
@@ -430,108 +432,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa
 
 
 ### Tilizing / Untilizing ###
+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def tilize(x):
-    """
-    This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function
-    produces row major tiles and creates faces. The output of this function is a flattened list that
-    we can send to the device.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
-    """
-    nearest_32 = _nearest_32
-
-    assert isinstance(
-        x, (torch.Tensor, np.ndarray)
-    ), "Input to this function must be an instance of torch.Tensor or np.array"
-    assert len(x.shape) == 4, "Only 4D tensors suppported"
-    assert (x.shape[-2] % 32) == 0 and (
-        x.shape[-1] % 32
-    ) == 0, "The last two dimensions of the tensor must be divisible by 32"
-
-    if isinstance(x, torch.Tensor):
-        ret = torch.zeros(np.prod(x.shape))
-    else:
-        ret = np.zeros(np.prod(x.shape))
-
-    idx = 0
-    for B in range(x.shape[0]):
-        for C in range(x.shape[1]):
-            for H in range(0, x.shape[2], 32):
-                for W in range(0, x.shape[3], 32):
-                    unfaced_tile = x[B, C, H : H + 32, W : W + 32]
-
-                    face0 = unfaced_tile[:16, :16]
-                    face1 = unfaced_tile[:16, 16:]
-                    face2 = unfaced_tile[16:, :16]
-                    face3 = unfaced_tile[16:, 16:]
-
-                    for face in (face0, face1, face2, face3):
-                        ret[idx : idx + 256] = face.reshape(-1)
-                        idx += 256
-
-    return ret.reshape(x.shape)
+    return x
 
 
+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def tilize_to_list(x):
     """
-    Tilize a PyTorch and then return the values as a flat list. The last two
-    tensor dims must be divisible by 32, after which this function produces row
-    major tiles and creates faces.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
+    Returns a flattened list of the tensor
     """
-
     return tilize(x).reshape(-1).tolist()
 
 
+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def untilize(x):
-    """
-    This function untilizes a tensor to row major format.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
-    """
-    nearest_32 = _nearest_32
-
-    assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor"
-    assert len(x.shape) == 4, "Only 4D tensors suppported"
-    assert (x.shape[-2] % 32) == 0 and (
-        x.shape[-1] % 32
-    ) == 0, "The last two dimensions of the tensor must be divisible by 32"
-
-    if isinstance(x, torch.Tensor):
-        ret = torch.zeros(x.shape, dtype=x.dtype)
-    else:
-        ret = np.zeros(x.shape, dtype=x.dtype)
-
-    for B in range(x.shape[0]):
-        for C in range(x.shape[1]):
-            x_hw = x[B, C, :].reshape(-1)
-            hw = 0
-            for h in range(0, x.shape[2], 32):
-                for w in range(0, x.shape[3], 32):
-                    f_tile = x_hw[hw : hw + 256].reshape(16, 16)
-                    ret[B, C, h : h + 16, w : w + 16] = f_tile
-
-                    f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16)
-                    ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile
-
-                    f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16)
-                    ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile
-
-                    f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16)
-                    ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile
-                    hw += 1024  # traverse tiles in RM-order
-
-    return ret
+    return x
 
 
 ### Measuring accuracy and other metrics ###

diff --git a/scripts/docker/requirements_dev.txt b/scripts/docker/requirements_dev.txt
@@ -1,7 +1,9 @@
-sudo
-nano
 acl
+emacs
 jq
+less
+libmpfr-dev
+nano
 openssh-server
+sudo
 vim
-libmpfr-dev
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -5,8 +5,7 @@ target_link_libraries(
     test_common_libs
     INTERFACE
         pthread
-        gtest
-        gtest_main
+        gmock_main
         magic_enum
         fmt::fmt-header-only
         span

diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py
@@ -48,7 +48,7 @@ def mesh_device_fixture():
     ttnn.close_device(device)
 
 
-def run_full(
+def run_conv2d_full_sweep(
     input_specs,
     input_channels,
     output_channels,
@@ -174,7 +174,7 @@ def run_full(
     return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf]
 
 
-def run_short(
+def run_conv2d_short_sweep(
     input_specs,
     device,
 ) -> list:
@@ -256,3 +256,77 @@ def run_short(
     torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))
 
     return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf]
+
+
+def run_conv1d_short_sweep(
+    input_specs,
+    device,
+) -> list:
+    [
+        batch_size,
+        output_channels,
+        input_channels,
+        input_length,
+        kernel_size,
+        stride,
+        padding,
+        groups,
+        has_bias,
+        dilation,
+    ] = input_specs
+    print(input_specs)
+
+    # has_bias = False
+    torch.manual_seed(0)
+    conv_input_shape = [batch_size, input_channels, input_length]
+    conv_weight_shape = [output_channels, input_channels // groups, kernel_size]
+    conv_bias_shape = [1, 1, 1, output_channels]
+    torch_input_tensor_ncl = torch.randn(conv_input_shape, dtype=torch.bfloat16).float()
+    torch_input_tensor = torch.permute(torch_input_tensor_ncl, (0, 2, 1))
+    torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float()
+    torch_bias_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() if has_bias else None
+    torch_out_golden_tensor = torch.nn.functional.conv1d(
+        torch_input_tensor_ncl,
+        torch_weight_tensor,
+        bias=torch_bias_tensor.reshape(-1) if has_bias else None,
+        stride=stride,
+        padding=padding,
+        groups=groups,
+    )
+
+    tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16)
+    tt_bias_tensor = None
+    if has_bias:
+        tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16)
+
+    tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16)
+
+    start_time = start_measuring_time()
+    [tt_output_tensor_on_device, out_length, [weights_device, bias_device]] = ttnn.Conv1d(
+        input_tensor=tt_input_tensor,
+        weight_tensor=tt_weight_tensor,
+        in_channels=input_channels,
+        out_channels=output_channels,
+        device=device,
+        bias_tensor=tt_bias_tensor,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        batch_size=batch_size,
+        input_length=input_length,
+        groups=groups,
+        return_output_dim=True,
+        return_weights_and_bias=True,
+    )
+
+    tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device)
+    torch_output_tensor = ttnn.to_torch(tt_output_tensor)
+    e2e_perf = stop_measuring_time(start_time)
+
+    # torch_output_tensor is in row major layout and NLC shape
+    # NLC to NCL
+    torch_output_tensor = torch_output_tensor.reshape(batch_size, out_length, output_channels)
+
+    torch_output_tensor = torch.permute(torch_output_tensor, (0, 2, 1))
+
+    return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py
@@ -12,7 +12,7 @@
 
 from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
 from models.utility_functions import torch_random
-from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture
+from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture
 
 # Override the default timeout in seconds for hang detection.
 TIMEOUT = 30
@@ -242,7 +242,7 @@ def run(
     *,
     device,
 ) -> list:
-    return run_full(
+    return run_conv2d_full_sweep(
         input_specs,
         input_channels,
         output_channels,