Skip to content

Commit

Permalink
Merge branch 'main' into blozano/rc_local
Browse files Browse the repository at this point in the history
  • Loading branch information
blozano-tt authored Dec 12, 2024
2 parents da6fb63 + 6e983a7 commit 2d40904
Show file tree
Hide file tree
Showing 53 changed files with 958 additions and 500 deletions.
1 change: 1 addition & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
QualifierAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
Expand Down
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -195,3 +195,5 @@ Checks: >
CheckOptions:
- key: readability-function-cognitive-complexity.IgnoreMacros
value: true

FormatStyle: 'file'
1 change: 1 addition & 0 deletions .github/workflows/_produce-data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ on:
- "(Single-card) Demo tests"
- "(Single-card) Tests for new models"
- "Nightly fast dispatch tests"
- "(Single-card) Nightly model and ttnn tests"
- "(Single-card) Tests for new models"
- "(T3K) T3000 demo tests"
- "(T3K) T3000 model perf tests"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,6 @@ jobs:
cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
timeout: 70
},
{
name: "WH N300 pgm dispatch nightly",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "issue-15821"],
cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh,
timeout: 10
},
{
name: "GS-only models",
arch: grayskull,
Expand Down Expand Up @@ -151,25 +144,26 @@ jobs:
test-config:
- model: "stable_diffusion"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion
- model: "mamba 1"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1
- model: "mamba 2"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2
- model: "mamba 3"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3
- model: "mamba 4"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4
# Skipping due to issue #15932
# - model: "mamba 1"
# cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1
# - model: "mamba 2"
# cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2
# - model: "mamba 3"
# cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3
# - model: "mamba 4"
# cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4
- model: "mamba 5"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 5
- model: "mamba 6"
cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6
# - model: "mamba 6"
# cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6
card: [N150, N300]
name: "[Unstable] Nightly ${{ matrix.card }} ${{ matrix.test-config.model }}"
env:
ARCH_NAME: wormhole_b0
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
runs-on: ["cloud-virtual-machine", "issue-15821", "${{ matrix.card }}"]
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
- uses: ./.github/actions/retry-command
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Nightly fast dispatch tests
name: "(Single-card) Nightly model and ttnn tests"

on:
workflow_dispatch:
Expand Down
6 changes: 3 additions & 3 deletions INSTALLING.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ Note the current compatibility matrix:

| Device | OS | Python | Driver (TT-KMD) | Firmware (TT-Flash) | TT-SMI | TT-Topology |
|---------------------|-----------------|----------|--------------------|--------------------------------------------|-----------------------|--------------------------------|
| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A |
| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | N/A |
| T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config |
| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A |
| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | N/A |
| T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config |

---

Expand Down
2 changes: 1 addition & 1 deletion models/demos/vgg/tests/test_perf_vgg.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


def get_expected_times(vgg):
return (16, 10.5)
return (17, 10.5)


@pytest.mark.models_performance_bare_metal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def unsqueeze_all_params_to_4d(params):
(2, 4, 64, 64),
],
)
@pytest.mark.skip(reason="#15931: Failing, skip for now")
def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_height, input_width):
device.enable_program_cache()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
@pytest.mark.parametrize("res_hidden_states_tuple", [([2, 1280, 8, 8], [2, 1280, 8, 8], [2, 1280, 8, 8])])
@pytest.mark.parametrize("hidden_states", [[2, 1280, 8, 8]])
@pytest.mark.parametrize("temb", [[1, 1, 2, 1280]])
@pytest.mark.skip(reason="#15931: Fails, need to investigate")
def test_upblock_512x512(reset_seeds, device, res_hidden_states_tuple, hidden_states, temb):
# TODO
# setup pytorch model
Expand Down
4 changes: 2 additions & 2 deletions models/demos/yolov4/tests/test_perf_yolo.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


def get_expected_times():
return (40, 16)
return (40, 16.2)


@pytest.mark.models_performance_bare_metal
Expand Down Expand Up @@ -96,7 +96,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name):
num_iterations = 1
margin = 0.03

expected_perf = 197.89
expected_perf = 199.89
command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py"

cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
Expand Down
100 changes: 8 additions & 92 deletions models/utility_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

from ttnn.device import Arch

from typing_extensions import deprecated


### Math operations ###
def _nearest_32(x):
Expand Down Expand Up @@ -430,108 +432,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa


### Tilizing / Untilizing ###
@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
def tilize(x):
"""
This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function
produces row major tiles and creates faces. The output of this function is a flattened list that
we can send to the device.
:param x: Input PyTorch Tensor
:type x: class:`torch.Tensor`
WARNING: This function should eventually be retired in favour of fully tilizing on device.
"""
nearest_32 = _nearest_32

assert isinstance(
x, (torch.Tensor, np.ndarray)
), "Input to this function must be an instance of torch.Tensor or np.array"
assert len(x.shape) == 4, "Only 4D tensors suppported"
assert (x.shape[-2] % 32) == 0 and (
x.shape[-1] % 32
) == 0, "The last two dimensions of the tensor must be divisible by 32"

if isinstance(x, torch.Tensor):
ret = torch.zeros(np.prod(x.shape))
else:
ret = np.zeros(np.prod(x.shape))

idx = 0
for B in range(x.shape[0]):
for C in range(x.shape[1]):
for H in range(0, x.shape[2], 32):
for W in range(0, x.shape[3], 32):
unfaced_tile = x[B, C, H : H + 32, W : W + 32]

face0 = unfaced_tile[:16, :16]
face1 = unfaced_tile[:16, 16:]
face2 = unfaced_tile[16:, :16]
face3 = unfaced_tile[16:, 16:]

for face in (face0, face1, face2, face3):
ret[idx : idx + 256] = face.reshape(-1)
idx += 256

return ret.reshape(x.shape)
return x


@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
def tilize_to_list(x):
"""
Tilize a PyTorch and then return the values as a flat list. The last two
tensor dims must be divisible by 32, after which this function produces row
major tiles and creates faces.
:param x: Input PyTorch Tensor
:type x: class:`torch.Tensor`
WARNING: This function should eventually be retired in favour of fully tilizing on device.
Returns a flattened list of the tensor
"""

return tilize(x).reshape(-1).tolist()


@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
def untilize(x):
"""
This function untilizes a tensor to row major format.
:param x: Input PyTorch Tensor
:type x: class:`torch.Tensor`
WARNING: This function should eventually be retired in favour of fully tilizing on device.
"""
nearest_32 = _nearest_32

assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor"
assert len(x.shape) == 4, "Only 4D tensors suppported"
assert (x.shape[-2] % 32) == 0 and (
x.shape[-1] % 32
) == 0, "The last two dimensions of the tensor must be divisible by 32"

if isinstance(x, torch.Tensor):
ret = torch.zeros(x.shape, dtype=x.dtype)
else:
ret = np.zeros(x.shape, dtype=x.dtype)

for B in range(x.shape[0]):
for C in range(x.shape[1]):
x_hw = x[B, C, :].reshape(-1)
hw = 0
for h in range(0, x.shape[2], 32):
for w in range(0, x.shape[3], 32):
f_tile = x_hw[hw : hw + 256].reshape(16, 16)
ret[B, C, h : h + 16, w : w + 16] = f_tile

f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16)
ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile

f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16)
ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile

f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16)
ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile
hw += 1024 # traverse tiles in RM-order

return ret
return x


### Measuring accuracy and other metrics ###
Expand Down
8 changes: 5 additions & 3 deletions scripts/docker/requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
sudo
nano
acl
emacs
jq
less
libmpfr-dev
nano
openssh-server
sudo
vim
libmpfr-dev
3 changes: 1 addition & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ target_link_libraries(
test_common_libs
INTERFACE
pthread
gtest
gtest_main
gmock_main
magic_enum
fmt::fmt-header-only
span
Expand Down
78 changes: 76 additions & 2 deletions tests/sweep_framework/sweep_utils/conv2d_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def mesh_device_fixture():
ttnn.close_device(device)


def run_full(
def run_conv2d_full_sweep(
input_specs,
input_channels,
output_channels,
Expand Down Expand Up @@ -174,7 +174,7 @@ def run_full(
return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf]


def run_short(
def run_conv2d_short_sweep(
input_specs,
device,
) -> list:
Expand Down Expand Up @@ -256,3 +256,77 @@ def run_short(
torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))

return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf]


def run_conv1d_short_sweep(
    input_specs,
    device,
) -> list:
    """Run one conv1d sweep case on ``device`` and compare it with PyTorch.

    ``input_specs`` must unpack into exactly ten values, in this order:
    batch_size, output_channels, input_channels, input_length, kernel_size,
    stride, padding, groups, has_bias, dilation.

    Returns a two-element list: the result of ``check_with_pcc`` (called with
    ``pcc=0.998``) and the value returned by ``stop_measuring_time``. The timed
    region covers the ``ttnn.Conv1d`` call plus the conversion of its output
    back to a torch tensor.
    """
    (
        batch_size,
        output_channels,
        input_channels,
        input_length,
        kernel_size,
        stride,
        padding,
        groups,
        has_bias,
        dilation,
    ) = input_specs
    # NOTE(review): `dilation` is unpacked but never forwarded to either
    # convolution below, so both run with their default dilation -- confirm
    # that this is intended for the sweep vectors in use.
    print(input_specs)

    # Fixed seed; the draw order (input, weight, optional bias) is kept as-is
    # so the generated values stay reproducible.
    torch.manual_seed(0)
    input_shape_ncl = [batch_size, input_channels, input_length]
    weight_shape = [output_channels, input_channels // groups, kernel_size]
    bias_shape = [1, 1, 1, output_channels]

    golden_input_ncl = torch.randn(input_shape_ncl, dtype=torch.bfloat16).float()
    # The device op is fed the input with its last two axes swapped.
    device_input_nlc = torch.permute(golden_input_ncl, (0, 2, 1))
    golden_weight = torch.randn(weight_shape, dtype=torch.bfloat16).float()
    golden_bias = None
    if has_bias:
        golden_bias = torch.randn(bias_shape, dtype=torch.bfloat16).float()

    # Reference result; the bias is flattened to 1-D for torch's conv1d.
    flat_bias = golden_bias.reshape(-1) if has_bias else None
    torch_out_golden_tensor = torch.nn.functional.conv1d(
        golden_input_ncl,
        golden_weight,
        bias=flat_bias,
        stride=stride,
        padding=padding,
        groups=groups,
    )

    tt_weight = ttnn.from_torch(golden_weight, ttnn.bfloat16)
    tt_bias = ttnn.from_torch(golden_bias, ttnn.bfloat16) if has_bias else None
    tt_input = ttnn.from_torch(device_input_nlc, ttnn.bfloat16)

    timer = start_measuring_time()
    conv_result = ttnn.Conv1d(
        input_tensor=tt_input,
        weight_tensor=tt_weight,
        in_channels=input_channels,
        out_channels=output_channels,
        device=device,
        bias_tensor=tt_bias,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        batch_size=batch_size,
        input_length=input_length,
        groups=groups,
        return_output_dim=True,
        return_weights_and_bias=True,
    )
    # The prepared weights/bias are returned as well but are not used here.
    [device_output, out_length, [_prepared_weights, _prepared_bias]] = conv_result

    host_output = ttnn.from_device(device_output)
    torch_output_tensor = ttnn.to_torch(host_output)
    e2e_perf = stop_measuring_time(timer)

    # Reshape the device result to (batch, length, channels), then swap the
    # last two axes so it lines up with the golden tensor.
    torch_output_tensor = torch_output_tensor.reshape(batch_size, out_length, output_channels)
    torch_output_tensor = torch.permute(torch_output_tensor, (0, 2, 1))

    pcc_result = check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998)
    return [pcc_result, e2e_perf]
4 changes: 2 additions & 2 deletions tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random
from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture
from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture

# Override the default timeout in seconds for hang detection.
TIMEOUT = 30
Expand Down Expand Up @@ -242,7 +242,7 @@ def run(
*,
device,
) -> list:
return run_full(
return run_conv2d_full_sweep(
input_specs,
input_channels,
output_channels,
Expand Down
Loading

0 comments on commit 2d40904

Please sign in to comment.