Merge branch 'main' into sabira/ttnn_lenet
sabira-mcw authored Dec 6, 2024
2 parents 7d948f4 + 317d346 commit 28a9bf3
Showing 71 changed files with 4,666 additions and 1,819 deletions.
1 change: 0 additions & 1 deletion .github/workflows/all-post-commit-workflows.yaml
@@ -159,7 +159,6 @@ jobs:
secrets: inherit
with:
os: ubuntu-22.04-amd64
if: github.event_name == 'push'
tt-train-cpp-unit-tests:
needs: build-artifact
secrets: inherit
2 changes: 2 additions & 0 deletions .github/workflows/metal-run-microbenchmarks.yaml
@@ -42,6 +42,8 @@ jobs:
PIPELINE_TYPE="microbenchmarks"
if [ "${{ matrix.runner-info.ccl }}" == "true" ]; then
PIPELINE_TYPE="ccl_microbenchmarks"
else
TT_METAL_SLOW_DISPATCH_MODE=1 ./tests/scripts/run_tunneler_tests.sh --machine-type ${{ matrix.runner-info.runs-on[0] }}
fi
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type "$PIPELINE_TYPE"
- name: Upload microbenchmark report csvs
23 changes: 0 additions & 23 deletions .github/workflows/test-comment.yaml

This file was deleted.

4 changes: 4 additions & 0 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -42,9 +42,11 @@ on:
- eltwise.unary.rsqrt.rsqrt_pytorch2
- eltwise.unary.rdiv.rdiv
- eltwise.unary.frac.frac
- eltwise.unary.frac.frac_sharded
- eltwise.unary.ceil.ceil
- eltwise.unary.ceil.ceil_pytorch2
- eltwise.unary.trunc.trunc
- eltwise.unary.trunc.trunc_sharded
- eltwise.unary.floor.floor
- eltwise.unary.floor.floor_pytorch2
- eltwise.unary.clone.clone
@@ -111,6 +113,7 @@ on:
- eltwise.unary.relu_max.relu_max
- eltwise.unary.softplus.softplus
- eltwise.unary.selu.selu
- eltwise.unary.softshrink.softshrink_sharded
- eltwise.unary_backward.fill_zero_bw
- eltwise.unary_backward.log_sigmoid_bw
- eltwise.unary_backward.logit_bw
@@ -180,6 +183,7 @@ on:
- eltwise.unary.mish.mish
- eltwise.unary.mish.mish_sharded
- eltwise.unary.multigammaln.multigammaln
- eltwise.unary.multigammaln.multigammaln_sharded
- eltwise.unary.isfinite.isfinite
- eltwise.unary.isfinite.isfinite_sharded
- eltwise.unary.isinf.isinf
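
Each name in this workflow-dispatch list is a sweep module given as a dotted path; the new eltwise.unary.frac.frac_sharded entry, for example, corresponds to the tests/sweep_framework/sweeps/eltwise/unary/frac/frac_sharded.py file added later in this commit. A minimal sketch of that assumed naming convention (the helper name is hypothetical, not part of the repo):

# Hypothetical helper illustrating the assumed dotted-name -> file-path convention.
def sweep_module_to_path(name: str) -> str:
    # "eltwise.unary.frac.frac_sharded" ->
    # "tests/sweep_framework/sweeps/eltwise/unary/frac/frac_sharded.py"
    return "tests/sweep_framework/sweeps/" + name.replace(".", "/") + ".py"
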
2 changes: 2 additions & 0 deletions CODEOWNERS
@@ -155,6 +155,8 @@ models/demos/t3000/mixtral8x7b @yieldthought @mtairum @uaydonat
models/demos/tg/llama3_70b @cglagovichTT @uaydonat @johanna-rock-tt @djordje-tt @kpaigwar
models/demos/tg/falcon7b @skhorasganiTT @djordje-tt @uaydonat
models/demos/grayskull @uaydonat
models/demos/yolov4 @dvartaniansTT @shwetankTT
models/demos/wormhole/yolov4 @dvartaniansTT @shwetankTT
models/demos/**/*resnet* @mywoodstock @shwetankTT @tt-aho
models/experimental/functional_unet @esmalTT @uaydonat @mywoodstock
models/perf/ @uaydonat
2 changes: 1 addition & 1 deletion models/demos/mnist/tests/test_perf_mnist.py
@@ -112,7 +112,7 @@ def test_perf_device_bare_metal(batch_size, reset_seeds):
num_iterations = 1
margin = 0.03
if is_grayskull():
expected_perf = 390000.0
expected_perf = 402500.0
elif is_wormhole_b0():
expected_perf = 900000.0

4 changes: 2 additions & 2 deletions models/demos/vgg/tests/test_perf_vgg.py
@@ -137,10 +137,10 @@ def test_perf_device_bare_metal_vgg(batch_size, model_name):
margin = 0.03

if model_name == "ttnn_vgg11":
expected_perf = 36 if is_grayskull() else 104
expected_perf = 36 if is_grayskull() else 114
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg11.py"
else:
expected_perf = 34 if is_grayskull() else 90
expected_perf = 34 if is_grayskull() else 105
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg16.py"

cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
@@ -416,6 +416,8 @@ def __call__(
hidden_states = ttnn.reshape(
hidden_states, (self.batch_size, 1, self.conv2_input_height * self.conv2_input_width, in_channels)
)
hidden_states = ttnn.reallocate(hidden_states)

hidden_states = ttnn.group_norm(
hidden_states,
num_groups=groups,
3 changes: 1 addition & 2 deletions tests/scripts/run_tt_eager.py
@@ -30,12 +30,11 @@
)

TT_EAGER_COMMON_TEST_ENTRIES = (
void_for_gs(TestEntry("tt_eager/tests/ops/ccl/test_ccl_helpers", "ops/ccl/test_ccl_helpers")),
void_for_gs(TestEntry("tt_eager/tests/ops/ccl/test_ccl_tensor_slicers", "ops/ccl/test_ccl_tensor_slicers")),
TestEntry("tt_eager/tests/ops/test_eltwise_binary_op", "ops/test_eltwise_binary_op"),
TestEntry("tt_eager/tests/ops/test_bcast_op", "ops/test_bcast_op"),
TestEntry("tt_eager/tests/ops/test_transpose_op", "ops/test_transpose_op"),
TestEntry("tt_eager/tests/ops/test_sliding_window_ops", "ops/test_sliding_window_ops"),
TestEntry("tt_eager/tests/ops/test_tensor_utils", "ops/test_tensor_utils"),
TestEntry("tt_eager/tests/ops/test_bmm_op", "ops/test_bmm_op"),
void_for_bh(void_for_whb0(TestEntry("tt_eager/tests/ops/test_eltwise_unary_op", "ops/test_eltwise_unary_op"))),
void_for_whb0(
57 changes: 57 additions & 0 deletions tests/scripts/run_tunneler_tests.sh
@@ -0,0 +1,57 @@
#!/bin/bash

set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi

if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then
echo "Must provide TT_METAL_SLOW_DISPATCH_MODE in environment" 1>&2
exit 1
fi

export TT_METAL_CLEAR_L1=1

echo "Running tunneler tests now...";

run_test() {
echo $1
$1
echo
};

run_test_with_watcher() {
echo $1
TT_METAL_WATCHER=1 TT_METAL_WATCHER_NOINLINE=1 $1
echo
};

main() {
# Parse the arguments
while [[ $# -gt 0 ]]; do
case $1 in
--machine-type)
machine_type=$2
shift
;;
*)
echo "Unknown option: $1"
exit 1
;;
esac
shift
done

if [[ $ARCH_NAME == "wormhole_b0" && $machine_type != "N150" ]]; then
for max_packet_size_words in 256 512 1024 2048; do
run_test "./build/test/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel --tx_x 4 --tx_y 7 --mux_x 0 --mux_y 7 --demux_x 0 --demux_y 0 --rx_x 0 --rx_y 1 --max_packet_size_words $max_packet_size_words --tx_skip_pkt_content_gen 1 --rx_disable_data_check 1 --rx_disable_header_check 1 --tx_pkt_dest_size_choice 1 --check_txrx_timeout 1 --data_kb_per_tx 1048576 --tunneler_queue_size_bytes 32768 --tx_queue_size_bytes 65536 --rx_queue_size_bytes 131072 --mux_queue_size_bytes 65536 --demux_queue_size_bytes 65536"
run_test "./build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep --tx_x 4 --tx_y 7 --mux_x 0 --mux_y 7 --demux_x 0 --demux_y 0 --rx_x 0 --rx_y 1 --max_packet_size_words $max_packet_size_words --tx_skip_pkt_content_gen 1 --rx_disable_data_check 1 --rx_disable_header_check 1 --tx_pkt_dest_size_choice 1 --check_txrx_timeout 1 --data_kb_per_tx 1048576 --tunneler_queue_size_bytes 32768 --tx_queue_size_bytes 65536 --rx_queue_size_bytes 131072 --mux_queue_size_bytes 65536 --demux_queue_size_bytes 65536"
run_test "./build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep --tx_x 4 --tx_y 7 --mux_x 0 --mux_y 7 --demux_x 0 --demux_y 0 --rx_x 0 --rx_y 1 --max_packet_size_words $max_packet_size_words --tx_skip_pkt_content_gen 1 --rx_disable_data_check 1 --rx_disable_header_check 1 --tx_pkt_dest_size_choice 1 --check_txrx_timeout 1 --data_kb_per_tx 1048576 --tunneler_queue_size_bytes 16384 --tx_queue_size_bytes 65536 --rx_queue_size_bytes 131072 --mux_queue_size_bytes 65536 --demux_queue_size_bytes 65536"
done
fi

}

main "$@"
20 changes: 12 additions & 8 deletions tests/sweep_framework/sweep_utils/sharding_utils.py
@@ -9,10 +9,10 @@
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import _gen_reshape_args_from_volume


def gen_sharded_spec_unary(num_shapes, max_tensor_size=4 * 1024 * 1024, layouts=["TILE_LAYOUT", "ROW_MAJOR_LAYOUT"]):
def gen_sharded_spec_unary(num_shapes, max_tensor_size_per_core=62 * 1024, layouts=["TILE_LAYOUT", "ROW_MAJOR_LAYOUT"]):
# device.compute_with_storage_grid_size()
y = 8
x = 8
Y = 8
X = 8

# ["BLOCK", "WIDTH", "HEIGHT", "tensor_wh"]
sharding_strategy_list = ["BLOCK", "WIDTH", "HEIGHT", "tensor_wh"]
@@ -29,6 +29,10 @@ def gen_sharded_spec_unary(num_shapes, max_tensor_size=4 * 1024 * 1024, layouts=
tensor_hw_as_shard_shape = False

for _ in range(num_shapes):
x = random.randint(1, X)
y = random.randint(1, Y)
max_tensor_size = max_tensor_size_per_core * x * y

if tensor_hw_as_shard_shape:
# Gets stuck:
# X 8 Y 8 input_shape [1, 17792, 8] DataType.BFLOAT8_B Layout.TILE ShardStrategy.BLOCK ShardOrientation.COL_MAJOR tensor_hw_as_shard_shape True
@@ -53,11 +57,6 @@ def gen_sharded_spec_unary(num_shapes, max_tensor_size=4 * 1024 * 1024, layouts=
input_shape[-1] *= 2
input_shape[-2] //= 2

if shard_orientation == "COL_MAJOR":
tmp = input_shape[-2]
input_shape[-2] = input_shape[-1]
input_shape[-1] = tmp

elif sharding_strategy == "BLOCK":
min_shard_size_y = 32 * y
min_shard_size_x = 32 * x
@@ -68,6 +67,11 @@ def gen_sharded_spec_unary(num_shapes, max_tensor_size=4 * 1024 * 1024, layouts=
physical_shape[1] *= min_shard_size_y
physical_shape[0] *= min_shard_size_x

if shard_orientation == "ROW_MAJOR":
tmp = physical_shape[-2]
physical_shape[-2] = physical_shape[-1]
physical_shape[-1] = tmp

input_shape = random.choice(_gen_reshape_args_from_volume(physical_shape[0], step=1, out_dims=rank - 1))
input_shape = list(input_shape["reshape_dims"])
input_shape.append(physical_shape[1])
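
The effect of this change is that the generated tensor-size budget now scales with a randomly drawn shard grid instead of being a fixed cap passed as max_tensor_size. A minimal sketch of the new sizing arithmetic (values mirror the defaults in the updated signature above; illustrative only, not the full generator):

# Illustrative sketch of the per-core sizing introduced above.
import random

X, Y = 8, 8                                  # storage grid assumed by the generator
max_tensor_size_per_core = 62 * 1024         # new default from the updated signature

x = random.randint(1, X)                     # cores used along each grid axis
y = random.randint(1, Y)
max_tensor_size = max_tensor_size_per_core * x * y   # budget grows with the shard grid

print(x, y, max_tensor_size)                 # e.g. a full 8 x 8 grid -> ~3.9 MiB budget
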
109 changes: 109 additions & 0 deletions tests/sweep_framework/sweeps/eltwise/unary/frac/frac_sharded.py
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import json
import torch
import random
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random

# Override the default timeout in seconds for hang detection.
TIMEOUT = 120

random.seed(0)


# Parameters provided to the test vector generator are defined here.
# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
# Developers can create their own generator functions and pass them to the parameters as inputs.
parameters = {
"nightly": {
"input_spec": gen_sharded_spec_unary(12, layouts=["TILE_LAYOUT"]),
"input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
},
}


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
pre_sharded_height = math.prod(input_shape[:-1])
pre_sharded_width = input_shape[-1]

if input_layout == "ROW_MAJOR_LAYOUT":
return True, "Input to eltwise binary must be tilized"

if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
return True, "bfloat8_b is only supported on tiled layout"

return False, None


# This is the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
def run(
input_spec,
input_a_dtype,
*,
device,
) -> list:
data_seed = random.randint(0, 20000000)
torch.manual_seed(data_seed)

(
input_shape,
core_grid,
sharding_strategy,
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
) = parse_sharding_spec(input_spec)

if input_layout == ttnn.ROW_MAJOR_LAYOUT:
input_shape = sanitize_shape_rm(input_shape)

torch_input_tensor_a = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
)(input_shape)

torch_op = ttnn.get_golden_function(ttnn.frac)
torch_output_tensor = torch_op(torch_input_tensor_a)

sharded_config = ttnn.create_sharded_memory_config_(
shape=input_shape,
core_grid=core_grid,
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
)

input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
dtype=input_a_dtype,
layout=input_layout,
device=device,
memory_config=sharded_config,
)

start_time = start_measuring_time()
output_tensor = ttnn.frac(input_tensor_a, memory_config=sharded_config)
e2e_perf = stop_measuring_time(start_time)
output_tensor = ttnn.to_torch(output_tensor)

pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
return [pcc, e2e_perf]
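
The sweep compares ttnn.frac on a sharded input against the golden function registered for ttnn.frac, with a 0.999 PCC threshold. Assuming that golden follows torch's fractional-part semantics, a device-free sketch of the property being checked (plain torch, illustrative only):

# Device-free reference for what the golden comparison is assumed to check.
import torch

x = torch.randn(32, 32) * 100
golden = torch.frac(x)              # fractional part; sign follows the input
reference = x - torch.trunc(x)      # equivalent definition: x minus its integer part
assert torch.allclose(golden, reference)
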
@@ -29,7 +29,7 @@
# Developers can create their own generator functions and pass them to the parameters as inputs.
parameters = {
"nightly": {
"input_spec": gen_sharded_spec_unary(16, max_tensor_size=2 * 1024 * 1024, layouts=["TILE_LAYOUT"]),
"input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=20 * 1024, layouts=["TILE_LAYOUT"]),
"input_a_dtype": [ttnn.bfloat16],
},
}
@@ -29,7 +29,7 @@
# Developers can create their own generator functions and pass them to the parameters as inputs.
parameters = {
"nightly": {
"input_spec": gen_sharded_spec_unary(16, max_tensor_size=1 * 1024 * 1024, layouts=["TILE_LAYOUT"]),
"input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=14 * 1024, layouts=["TILE_LAYOUT"]),
"input_a_dtype": [ttnn.bfloat16],
"eps": [0.2], # 0, 10e-6, 10e-4, 10e-2,
},