diff --git a/.clang-format b/.clang-format index d29b0dbdcef5..301ae10318a3 100644 --- a/.clang-format +++ b/.clang-format @@ -95,6 +95,7 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left +QualifierAlignment: Left RawStringFormats: - Language: Cpp Delimiters: diff --git a/.clang-tidy b/.clang-tidy index 273f9d885e80..9e775a897962 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -195,3 +195,5 @@ Checks: > CheckOptions: - key: readability-function-cognitive-complexity.IgnoreMacros value: true + +FormatStyle: 'file' diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml index c66c5bb57023..1f13fd8274e5 100644 --- a/.github/workflows/_produce-data.yaml +++ b/.github/workflows/_produce-data.yaml @@ -24,6 +24,7 @@ on: - "(Single-card) Demo tests" - "(Single-card) Tests for new models" - "Nightly fast dispatch tests" + - "(Single-card) Nightly model and ttnn tests" - "(Single-card) Tests for new models" - "(T3K) T3000 demo tests" - "(T3K) T3000 model perf tests" diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 17062e6f41ee..613b7bf55022 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -40,13 +40,6 @@ jobs: cmd: tests/scripts/single_card/nightly/run_ttnn.sh, timeout: 70 }, - { - name: "WH N300 pgm dispatch nightly", - arch: wormhole_b0, - runs-on: ["cloud-virtual-machine", "N300", "issue-15821"], - cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh, - timeout: 10 - }, { name: "GS-only models", arch: grayskull, @@ -151,25 +144,26 @@ jobs: test-config: - model: "stable_diffusion" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - - model: "mamba 1" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - - model: "mamba 2" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2 - - model: "mamba 3" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3 - - model: "mamba 4" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4 + # Skipping due to issue #15932 + # - model: "mamba 1" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 + # - model: "mamba 2" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2 + # - model: "mamba 3" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3 + # - model: "mamba 4" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4 - model: "mamba 5" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 5 - - model: "mamba 6" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6 + # - model: "mamba 6" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6 card: [N150, N300] name: "[Unstable] Nightly ${{ matrix.card }} ${{ matrix.test-config.model }}" env: ARCH_NAME: wormhole_b0 LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"] + runs-on: ["cloud-virtual-machine", "issue-15821", "${{ matrix.card }}"] 
steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - uses: ./.github/actions/retry-command diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index 409833993f46..65bacbceada8 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -1,4 +1,4 @@ -name: Nightly fast dispatch tests +name: "(Single-card) Nightly model and ttnn tests" on: workflow_dispatch: diff --git a/INSTALLING.md b/INSTALLING.md index 76f8cecd6277..09092083db98 100644 --- a/INSTALLING.md +++ b/INSTALLING.md @@ -20,9 +20,9 @@ Note the current compatability matrix: | Device | OS | Python | Driver (TT-KMD) | Firmware (TT-Flash) | TT-SMI | TT-Topology | |---------------------|-----------------|----------|--------------------|--------------------------------------------|-----------------------|--------------------------------| -| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A | -| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | N/A | -| T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config | +| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A | +| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | N/A | +| T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config | --- diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index 9cc0397bb073..8faff8a5b733 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -22,7 +22,7 @@ def get_expected_times(vgg): - return (16, 10.5) + return (17, 10.5) @pytest.mark.models_performance_bare_metal diff --git a/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py index 72efdb4e178e..24b9e22dea2d 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py @@ -72,6 +72,7 @@ def unsqueeze_all_params_to_4d(params): (2, 4, 64, 64), ], ) +@pytest.mark.skip(reason="#15931: Failing, skip for now") def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_height, input_width): device.enable_program_cache() diff --git a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py index 0445d58164be..cb9fd165f3f7 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py @@ -29,6 +29,7 @@ @pytest.mark.parametrize("res_hidden_states_tuple", [([2, 1280, 8, 8], [2, 1280, 8, 8], [2, 1280, 8, 8])]) @pytest.mark.parametrize("hidden_states", [[2, 1280, 8, 8]]) @pytest.mark.parametrize("temb", [[1, 1, 2, 1280]]) +@pytest.mark.skip(reason="#15931: Fails, need to investigate") def test_upblock_512x512(reset_seeds, device, res_hidden_states_tuple, hidden_states, temb): # TODO # setup pytorch model diff --git a/models/demos/yolov4/tests/test_perf_yolo.py 
b/models/demos/yolov4/tests/test_perf_yolo.py index f04566ebbd1e..a9e0009e6f5b 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -23,7 +23,7 @@ def get_expected_times(): - return (40, 16) + return (40, 16.2) @pytest.mark.models_performance_bare_metal @@ -96,7 +96,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name): num_iterations = 1 margin = 0.03 - expected_perf = 197.89 + expected_perf = 199.89 command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/utility_functions.py b/models/utility_functions.py index 2b652f815424..f13fd48d8ca0 100644 --- a/models/utility_functions.py +++ b/models/utility_functions.py @@ -15,6 +15,8 @@ from ttnn.device import Arch +from typing_extensions import deprecated + ### Math operations ### def _nearest_32(x): @@ -430,108 +432,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa ### Tilizing / Untilizing ### +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize(x): - """ - This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function - produces row major tiles and creates faces. The output of this function is a flattened list that - we can send to the device. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. - """ - nearest_32 = _nearest_32 - - assert isinstance( - x, (torch.Tensor, np.ndarray) - ), "Input to this function must be an instance of torch.Tensor or np.array" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(np.prod(x.shape)) - else: - ret = np.zeros(np.prod(x.shape)) - - idx = 0 - for B in range(x.shape[0]): - for C in range(x.shape[1]): - for H in range(0, x.shape[2], 32): - for W in range(0, x.shape[3], 32): - unfaced_tile = x[B, C, H : H + 32, W : W + 32] - - face0 = unfaced_tile[:16, :16] - face1 = unfaced_tile[:16, 16:] - face2 = unfaced_tile[16:, :16] - face3 = unfaced_tile[16:, 16:] - - for face in (face0, face1, face2, face3): - ret[idx : idx + 256] = face.reshape(-1) - idx += 256 - - return ret.reshape(x.shape) + return x +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize_to_list(x): """ - Tilize a PyTorch and then return the values as a flat list. The last two - tensor dims must be divisible by 32, after which this function produces row - major tiles and creates faces. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. + Returns a flattened list of the tensor """ - return tilize(x).reshape(-1).tolist() +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def untilize(x): - """ - This function untilizes a tensor to row major format. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. 
- """ - nearest_32 = _nearest_32 - - assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(x.shape, dtype=x.dtype) - else: - ret = np.zeros(x.shape, dtype=x.dtype) - - for B in range(x.shape[0]): - for C in range(x.shape[1]): - x_hw = x[B, C, :].reshape(-1) - hw = 0 - for h in range(0, x.shape[2], 32): - for w in range(0, x.shape[3], 32): - f_tile = x_hw[hw : hw + 256].reshape(16, 16) - ret[B, C, h : h + 16, w : w + 16] = f_tile - - f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16) - ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile - - f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile - - f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile - hw += 1024 # traverse tiles in RM-order - - return ret + return x ### Measuring accuracy and other metrics ### diff --git a/scripts/docker/requirements_dev.txt b/scripts/docker/requirements_dev.txt index e32fe8958d24..e7029ab3bc7f 100644 --- a/scripts/docker/requirements_dev.txt +++ b/scripts/docker/requirements_dev.txt @@ -1,7 +1,9 @@ -sudo -nano acl +emacs jq +less +libmpfr-dev +nano openssh-server +sudo vim -libmpfr-dev diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 544d624088c8..6a15d0c6db40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,8 +5,7 @@ target_link_libraries( test_common_libs INTERFACE pthread - gtest - gtest_main + gmock_main magic_enum fmt::fmt-header-only span diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index c7509247213e..a7354f8363a3 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -48,7 +48,7 @@ def mesh_device_fixture(): ttnn.close_device(device) -def run_full( +def run_conv2d_full_sweep( input_specs, input_channels, output_channels, @@ -174,7 +174,7 @@ def run_full( return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] -def run_short( +def run_conv2d_short_sweep( input_specs, device, ) -> list: @@ -256,3 +256,77 @@ def run_short( torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2)) return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] + + +def run_conv1d_short_sweep( + input_specs, + device, +) -> list: + [ + batch_size, + output_channels, + input_channels, + input_length, + kernel_size, + stride, + padding, + groups, + has_bias, + dilation, + ] = input_specs + print(input_specs) + + # has_bias = False + torch.manual_seed(0) + conv_input_shape = [batch_size, input_channels, input_length] + conv_weight_shape = [output_channels, input_channels // groups, kernel_size] + conv_bias_shape = [1, 1, 1, output_channels] + torch_input_tensor_ncl = torch.randn(conv_input_shape, dtype=torch.bfloat16).float() + torch_input_tensor = torch.permute(torch_input_tensor_ncl, (0, 2, 1)) + torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float() + torch_bias_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() if has_bias else None + torch_out_golden_tensor = torch.nn.functional.conv1d( + torch_input_tensor_ncl, + torch_weight_tensor, + 
bias=torch_bias_tensor.reshape(-1) if has_bias else None, + stride=stride, + padding=padding, + groups=groups, + ) + + tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16) + tt_bias_tensor = None + if has_bias: + tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16) + + tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) + + start_time = start_measuring_time() + [tt_output_tensor_on_device, out_length, [weights_device, bias_device]] = ttnn.Conv1d( + input_tensor=tt_input_tensor, + weight_tensor=tt_weight_tensor, + in_channels=input_channels, + out_channels=output_channels, + device=device, + bias_tensor=tt_bias_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + batch_size=batch_size, + input_length=input_length, + groups=groups, + return_output_dim=True, + return_weights_and_bias=True, + ) + + tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) + torch_output_tensor = ttnn.to_torch(tt_output_tensor) + e2e_perf = stop_measuring_time(start_time) + + # torch_output_tensor is in row major layout and NLC shape + # NLC to NCL + torch_output_tensor = torch_output_tensor.reshape(batch_size, out_length, output_channels) + + torch_output_tensor = torch.permute(torch_output_tensor, (0, 2, 1)) + + return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py index 144640d642d6..0bb58c4dac09 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py @@ -12,7 +12,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. TIMEOUT = 30 @@ -242,7 +242,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py index 4b6a3a29259a..bfcc0d058009 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py @@ -12,7 +12,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. 
TIMEOUT = 30 @@ -111,7 +111,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py index 3b3db7da7ed2..83c2f233fd5e 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py @@ -11,7 +11,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. TIMEOUT = 30 @@ -109,7 +109,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index c41f6be90928..743d5ac652df 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -12,10 +12,14 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_short, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import ( + run_conv2d_short_sweep, + run_conv1d_short_sweep, + mesh_device_fixture, +) parameters = { - "short_sweep_suite": { + "short_sweep_suite_conv2d": { "input_specs": [ # Contains following params # [batch_size, output_channels, input_channels, input_height, input_width, kernel_height, kernel_width, stride_x, stride_y, pad_x, pad_y, groups, bias, dilation] @@ -1566,6 +1570,18 @@ [1, 320, 960, 64, 64, 1, 1, 1, 1, 0, 0, 1, True, 1], [1, 320, 960, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], ], + "is_conv1d": [False], + }, + "short_sweep_suite_conv1d": { + "input_specs": [ + # Contains following params + # [batch_size, output_channels, input_channels, input_length, kernel_size, stride, pad, groups, bias, dilation] + [1, 256, 1024, 512, 1, 1, 0, 1, True, 1], + [1, 1024, 256, 512, 1, 1, 0, 1, True, 1], + [1, 768, 768, 3000, 3, 2, 1, 1, True, 1], + [1, 768, 80, 3000, 3, 1, 1, 1, True, 1], + ], + "is_conv1d": [True], }, } @@ -1576,22 +1592,23 @@ def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: def run( input_specs, + is_conv1d=False, *, device, ) -> list: - return run_short( - input_specs, - device, - ) + if is_conv1d: + return run_conv1d_short_sweep(input_specs, device) + else: + return run_conv2d_short_sweep(input_specs, device) import pytest -@pytest.mark.parametrize("input_spec", parameters["short_sweep_suite"]["input_specs"]) +@pytest.mark.parametrize("input_spec", parameters["short_sweep_suite_conv2d"]["input_specs"]) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun(device, input_spec): - run_short( + run_conv2d_short_sweep( input_spec, device, ) @@ -1658,7 +1675,7 @@ def test_conv2d_localrun(device, input_spec): @pytest.mark.parametrize("input_spec", failing_parameters) 
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun_fail_only(device, input_spec): - run_short( + run_conv2d_short_sweep( input_spec, device, ) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py index 4245a35c3c2d..3044f6bbb892 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py @@ -9,15 +9,7 @@ import ttnn import torch import numpy as np - - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py index e672856c3e20..b280d8e0b66c 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py @@ -10,15 +10,7 @@ import torch import numpy as np import ttnn - - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py index 6eb3991d34da..88bafe821b3f 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py @@ -1845,7 +1845,6 @@ def test_sharded_tilize_with_val_padding(input_shape, sharding_config, output_dt assert passing -@skip_for_blackhole("GH #15234") @pytest.mark.parametrize("N", [8, 16]) @pytest.mark.parametrize("in_sharded", [True], ids=["in0_sharded"]) @pytest.mark.parametrize("out_sharded", [True], ids=["out_sharded"]) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py index 050099d62d60..1c19b8137e68 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py @@ -11,14 +11,9 @@ import ttnn from models.utility_functions import get_debug_tensor +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype from enum import Enum -tt_dtype_to_torch_dtype = { - ttnn.uint32: torch.int32, - ttnn.uint16: torch.int16, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} TILE_WIDTH = 32 TILE_HEIGHT = 32 diff --git a/tests/ttnn/CMakeLists.txt b/tests/ttnn/CMakeLists.txt index c14a587dd727..3117e6b89205 100644 --- a/tests/ttnn/CMakeLists.txt +++ b/tests/ttnn/CMakeLists.txt @@ -6,7 +6,7 @@ function(setup_ttnn_test_target target_name) test_common_libs ttnn Metalium::Metal - GTest::gtest_main + GTest::gmock_main ) target_include_directories( ${target_name} diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp index f4279cc87535..7ef367335f6e 100644 --- 
a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp @@ -8,6 +8,7 @@ #include "buffers/buffer_constants.hpp" #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/cpp/ttnn/tensor/types.hpp" #include "ttnn/distributed/api.hpp" @@ -17,6 +18,7 @@ namespace ttnn::distributed::test { namespace { +using ::testing::SizeIs; using ::tt::tt_metal::BufferType; using ::tt::tt_metal::Layout; using ::tt::tt_metal::MemoryConfig; @@ -57,7 +59,7 @@ TEST_P(MultiDeviceTensorCreationTest, EmptyLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(tensor.storage_type(), StorageType::DEVICE); - EXPECT_EQ(tensor.get_workers().size(), 1); + EXPECT_THAT(tensor.get_workers(), SizeIs(1)); const Tensor mesh_replicated_tensor = ttnn::empty_like( tensor, @@ -67,7 +69,7 @@ TEST_P(MultiDeviceTensorCreationTest, EmptyLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); const auto distributed_tensor_config = get_distributed_tensor_config_from_tensor(mesh_replicated_tensor); EXPECT_TRUE(std::holds_alternative(distributed_tensor_config)); @@ -86,7 +88,7 @@ TEST_P(MultiDeviceTensorCreationTest, Full) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), ttnn::SimpleShape({32, 32})); EXPECT_EQ(mesh_replicated_tensor.dtype(), DataType::BFLOAT16); EXPECT_EQ(mesh_replicated_tensor.layout(), Layout::ROW_MAJOR); @@ -109,7 +111,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(tensor.storage_type(), StorageType::DEVICE); - EXPECT_EQ(tensor.get_workers().size(), 1); + EXPECT_THAT(tensor.get_workers(), SizeIs(1)); Tensor mesh_replicated_tensor = ttnn::full_like( tensor, @@ -119,7 +121,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLike) { std::ref(*mesh_device)); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), tensor.shape()); EXPECT_EQ(mesh_replicated_tensor.dtype(), tensor.dtype()); EXPECT_EQ(mesh_replicated_tensor.layout(), tensor.layout()); @@ -161,7 +163,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLikeWithOptTensor) { opt_output); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), tensor.shape()); EXPECT_EQ(mesh_replicated_tensor.dtype(), tensor.dtype()); EXPECT_EQ(mesh_replicated_tensor.layout(), tensor.layout()); diff --git 
a/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py index 634423088311..2fff322de44d 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py @@ -11,29 +11,10 @@ import numpy as np import ttnn - -tt_dtype_to_torch_dtype = { - ttnn.uint8: torch.uint8, - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.int32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, - ttnn.bfloat4_b: torch.float, -} - -tt_dtype_to_np_dtype = { - ttnn.uint8: np.ubyte, - ttnn.uint16: np.int16, - ttnn.uint32: np.int32, - ttnn.int32: np.int32, - ttnn.float32: np.float32, - ttnn.bfloat8_b: np.float32, - ttnn.bfloat4_b: np.float32, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype, tt_dtype_to_np_dtype +@pytest.mark.parametrize("convert_to_device", [True, False]) @pytest.mark.parametrize( "tt_dtype", [ @@ -49,7 +30,7 @@ ) @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) @pytest.mark.parametrize("python_lib", [torch, np]) -def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): +def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, convert_to_device, device): torch.manual_seed(0) if python_lib == torch: @@ -64,7 +45,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): elif python_lib == np: if tt_dtype == ttnn.bfloat16: - pytest.skip("ttnn.bloat16 dtype is not supported yet for numpy tensors!") + pytest.skip("ttnn.bfloat16 dtype is not supported yet for numpy tensors!") dtype = tt_dtype_to_np_dtype[tt_dtype] if dtype in {np.ubyte, np.int16, np.int32}: @@ -82,8 +63,9 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED assert tt_tensor.layout == ttnn.ROW_MAJOR_LAYOUT - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() + if convert_to_device: + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() if python_lib == torch: py_tensor_after_round_trip = tt_tensor.to_torch() @@ -123,6 +105,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): } +@pytest.mark.parametrize("convert_to_device", [True, False]) @pytest.mark.parametrize( "python_dtype_str", [ @@ -137,7 +120,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): ) @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) @pytest.mark.parametrize("python_lib", [torch, np]) -def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str, device): +def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str, convert_to_device, device): torch.manual_seed(0) if python_lib == torch: @@ -165,8 +148,9 @@ def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str tt_tensor = ttnn.Tensor(py_tensor) assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() + if convert_to_device: + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() if python_lib == torch: py_tensor_after_round_trip = tt_tensor.to_torch() diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_creation.py b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py new file mode 100644 index 000000000000..f0615abba973 --- /dev/null +++ b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: © 
2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import os +import pathlib + +import torch +import numpy as np + +import ttnn +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype + + +@pytest.mark.parametrize( + "layout", + [ + ttnn.ROW_MAJOR_LAYOUT, + ttnn.TILE_LAYOUT, + ], +) +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint8, + ttnn.uint16, + ttnn.uint32, + ttnn.int32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +def test_tensor_creation(shape, tt_dtype, layout, device): + torch.manual_seed(0) + + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.uint8, torch.int16, torch.int32}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + tt_tensor = ttnn.Tensor(py_tensor, tt_dtype, device, layout) + + tt_tensor = tt_tensor.cpu() + + py_tensor_after_round_trip = tt_tensor.to_torch() + + assert py_tensor.dtype == py_tensor_after_round_trip.dtype + assert py_tensor.shape == py_tensor_after_round_trip.shape + + allclose_kwargs = {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = torch.allclose(py_tensor, py_tensor_after_round_trip, **allclose_kwargs) + assert passing + + +@pytest.mark.parametrize( + "layout", + [ + ttnn.ROW_MAJOR_LAYOUT, + ttnn.TILE_LAYOUT, + ], +) +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint8, + ttnn.uint16, + ttnn.uint32, + ttnn.int32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +def test_tensor_creation_api_parity(shape, tt_dtype, layout, device): + torch.manual_seed(0) + + if tt_dtype in (ttnn.bfloat8_b, ttnn.bfloat4_b) and layout == ttnn.ROW_MAJOR_LAYOUT: + pytest.skip("{} is only valid for ttnn.TILE_LAYOUT!".format(tt_dtype)) + + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.uint8, torch.int16, torch.int32}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + tt_tensor_1 = ttnn.Tensor(py_tensor, tt_dtype, device, layout) + tt_tensor_2 = ttnn.from_torch(py_tensor, tt_dtype, device=device, layout=layout) + + tt_tensor_1 = tt_tensor_1.cpu() + tt_tensor_2 = tt_tensor_2.cpu() + + py_tensor_after_round_trip_1 = tt_tensor_1.to_torch() + py_tensor_after_round_trip_2 = tt_tensor_2.to_torch() + py_tensor_after_round_trip_3 = ttnn.to_torch(tt_tensor_1) + py_tensor_after_round_trip_4 = ttnn.to_torch(tt_tensor_2) + + allclose_kwargs = {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = torch.allclose(py_tensor, py_tensor_after_round_trip_1, **allclose_kwargs) + passing &= torch.allclose(py_tensor, py_tensor_after_round_trip_2, **allclose_kwargs) + passing &= torch.allclose(py_tensor, py_tensor_after_round_trip_3, **allclose_kwargs) + passing &= torch.allclose(py_tensor, py_tensor_after_round_trip_4, **allclose_kwargs) + assert passing diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py index 1db497c08439..a56dde83d199 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py +++
b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py @@ -11,15 +11,7 @@ import numpy as np import ttnn - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, - ttnn.bfloat4_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) diff --git a/tests/ttnn/unit_tests/test_print_tensor.py b/tests/ttnn/unit_tests/test_print_tensor.py index 66254f7d3634..90f1ecd51579 100644 --- a/tests/ttnn/unit_tests/test_print_tensor.py +++ b/tests/ttnn/unit_tests/test_print_tensor.py @@ -7,14 +7,8 @@ import torch import ttnn +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype -ttnn_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} GOLDEN_TENSOR_STRINGS = { ( @@ -77,7 +71,7 @@ def test_print(device, dtype, layout, profile, deallocate): ttnn.set_printoptions(profile=profile) - torch_dtype = ttnn_dtype_to_torch_dtype[dtype] + torch_dtype = tt_dtype_to_torch_dtype[dtype] shape = (2, 16, 64, 32) if torch_dtype in {torch.int16, torch.int32}: diff --git a/tests/ttnn/utils_for_testing.py b/tests/ttnn/utils_for_testing.py index fb083a681fff..92849b32e57d 100644 --- a/tests/ttnn/utils_for_testing.py +++ b/tests/ttnn/utils_for_testing.py @@ -10,6 +10,33 @@ from models.utility_functions import comp_pcc, comp_equal, divup, roundup from typing import Tuple +import ttnn +import torch +import numpy as np + + +# Dictionaries for converting dtypes +tt_dtype_to_torch_dtype = { + ttnn.uint8: torch.uint8, + ttnn.uint16: torch.int16, + ttnn.uint32: torch.int32, + ttnn.int32: torch.int32, + ttnn.float32: torch.float, + ttnn.bfloat16: torch.bfloat16, + ttnn.bfloat8_b: torch.float, + ttnn.bfloat4_b: torch.float, +} + +tt_dtype_to_np_dtype = { + ttnn.uint8: np.ubyte, + ttnn.uint16: np.int16, + ttnn.uint32: np.int32, + ttnn.int32: np.int32, + ttnn.float32: np.float32, + ttnn.bfloat8_b: np.float32, + ttnn.bfloat4_b: np.float32, +} + def construct_pcc_assert_message(message, expected_pytorch_result, actual_pytorch_result): messages = [] diff --git a/tt-train/tests/CMakeLists.txt b/tt-train/tests/CMakeLists.txt index 0faac2d3ee35..5eee6b8e77b8 100644 --- a/tt-train/tests/CMakeLists.txt +++ b/tt-train/tests/CMakeLists.txt @@ -13,7 +13,7 @@ file( add_executable(ttml_tests ${SOURCES}) target_link_libraries( ttml_tests - GTest::gtest_main + GTest::gmock_main ttml ) add_definitions(-DTEST_DATA_DIR="${CMAKE_SOURCE_DIR}/data") diff --git a/tt-train/tests/core/distributed_test.cpp b/tt-train/tests/core/distributed_test.cpp index 0f304788ca38..4d9bc0e8ae6a 100644 --- a/tt-train/tests/core/distributed_test.cpp +++ b/tt-train/tests/core/distributed_test.cpp @@ -2,12 +2,17 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include #include "core/distributed_mapping.hpp" +namespace { + +using ::testing::SizeIs; + template class MeshOpsTest : public ::testing::Test { protected: @@ -25,7 +30,7 @@ TYPED_TEST(MeshOpsTest, ChunkBasicNonDivisible3) { // Chunk into 3 parts along dimension 0 auto chunks = ttml::core::chunk(tensor, 3, 0); - ASSERT_EQ(chunks.size(), 3u); + ASSERT_THAT(chunks, SizeIs(3)); EXPECT_EQ(chunks[0].shape()[0], 4u); // first chunk size 4 EXPECT_EQ(chunks[1].shape()[0], 4u); // next chunk size 4 EXPECT_EQ(chunks[2].shape()[0], 2u); // last chunk size 2 @@ -38,7 +43,7 @@ 
TYPED_TEST(MeshOpsTest, ChunkBasicLessChunksThanProvided) { // Chunk into 6 parts along dimension 0 auto chunks = ttml::core::chunk(tensor, 6, 0); - ASSERT_EQ(chunks.size(), 5u); + ASSERT_THAT(chunks, SizeIs(5)); EXPECT_EQ(chunks[0].shape()[0], 3u); // first chunk size 3 EXPECT_EQ(chunks[1].shape()[0], 3u); // next chunk size 3 EXPECT_EQ(chunks[2].shape()[0], 3u); // next chunk size 3 @@ -56,7 +61,7 @@ TYPED_TEST(MeshOpsTest, ShardXTensorToMeshBasicShard) { auto shards = sharder.map(tensor); // With 4 shards, each shard should have size 2 - ASSERT_EQ(shards.size(), 4u); + ASSERT_THAT(shards, SizeIs(4)); for (auto& s : shards) { EXPECT_EQ(s.size(), 2u); } @@ -73,7 +78,7 @@ TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { ttml::core::ShardTensor2dMesh sharder(mesh_shape, {0, 1}); auto shards = sharder.map(tensor); - ASSERT_EQ(shards.size(), 4u); + ASSERT_THAT(shards, SizeIs(4)); // Check shapes of shards for (auto& shard : shards) { EXPECT_EQ(shard.shape()[0], 2u); @@ -90,7 +95,7 @@ TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { ttml::core::ReplicateXTensorToMesh replicator(mesh_shape); auto replicas = replicator.map(tensor); - ASSERT_EQ(static_cast(replicas.size()), num_devices); + ASSERT_THAT(replicas, SizeIs(num_devices)); for (const auto& t : replicas) { EXPECT_TRUE(xt::allclose(t, tensor)); } @@ -243,3 +248,4 @@ TYPED_TEST(MeshOpsTest, ConcatenateSameParametersAsCompose) { TypeParam(0), TypeParam(1), TypeParam(2), TypeParam(3), TypeParam(4), TypeParam(5)}; EXPECT_TRUE(xt::allclose(composed, expected)); } +} // namespace diff --git a/tt-train/tests/model/gpt2s_test.cpp b/tt-train/tests/model/gpt2s_test.cpp new file mode 100644 index 000000000000..bfad28597d8d --- /dev/null +++ b/tt-train/tests/model/gpt2s_test.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "autograd/auto_context.hpp" +#include "core/compute_kernel_config.hpp" +#include "core/tt_tensor_utils.hpp" + +enum class ExpectedResult { OK, ERROR }; + +struct MatmulInput { + ttnn::Shape shape_a; + ttnn::Shape shape_b; + bool transpose_a{false}; + bool transpose_b{false}; +}; + +struct MatmulTest { + MatmulInput input; + ExpectedResult expected_result; +}; + +// Matmul tests are based on GPT2-S model with batch size 64 +TEST(GPT2SBatch64Test, Matmul) { + std::vector tests = { + {{{64, 12, 64, 1024}, {64, 12, 1024, 64}, false, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 1024, 64}, false, true}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 1024, 64}, true, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 64, 1024}, false, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 1024}, {64, 12, 1024, 64}, false, false}, ExpectedResult::OK}, + {{{768, 65536}, {65536, 96}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 96}, true, false}, ExpectedResult::OK}, + {{{65536, 96}, {1, 1, 96, 768}, false, false}, ExpectedResult::ERROR}, + {{{65536, 96}, {1, 1, 768, 96}, false, true}, ExpectedResult::ERROR}, + {{{3072, 65536}, {65536, 768}, false, false}, ExpectedResult::OK}, + {{{65536, 3072}, {65536, 768}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {1, 1, 768, 3072}, false, false}, ExpectedResult::ERROR}, + {{{65536, 768}, {1, 1, 3072, 768}, false, true}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 3072}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 3072}, true, false}, ExpectedResult::OK}, + {{{65536, 3072}, {1, 1, 3072, 768}, false, 
false}, ExpectedResult::ERROR}, + {{{65536, 3072}, {1, 1, 768, 3072}, false, true}, ExpectedResult::ERROR}, + {{{65536, 3072}, {3072, 768}, false, false}, ExpectedResult::ERROR}, + {{{65536, 3072}, {768, 3072}, false, true}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 768}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 768}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {1, 1, 768, 768}, false, false}, ExpectedResult::ERROR}, + {{{768, 65536}, {1, 1, 768, 768}, true, false}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 2304}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 2304}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {768, 50257}, false, false}, ExpectedResult::ERROR}, + {{{65536, 768}, {50257, 768}, false, true}, ExpectedResult::ERROR}, + {{{65536, 50257}, {50257, 768}, false, false}, ExpectedResult::ERROR}, + }; + + auto run_matmul = [](auto& a, auto& b, bool transpose_a, bool transpose_b) { + fmt::println( + "Running matmul with shapes {} and {}, transpose_a {} transpose_b {}", + a.get_shape(), + b.get_shape(), + transpose_a, + transpose_b); + [[maybe_unused]] auto c = ttnn::matmul( + a, + b, + transpose_a, + transpose_b, + /* memory_config */ std::nullopt, + /* dtype */ std::nullopt, + /* program_config */ std::nullopt, + /* activation */ std::nullopt, + /* compute_kernel_config */ + ttml::core::ComputeKernelConfig::matmul(), + /* core_grid */ ttnn::CoreGrid{7, 8}, + /* output_tile */ std::nullopt); + }; + + for (const auto& [input, expected_result] : tests) { + auto [shape_a, shape_b, transpose_a, transpose_b] = input; + + auto* device = &ttml::autograd::ctx().get_device(); + auto a = ttml::core::empty(shape_a, device, {}); + auto b = ttml::core::empty(shape_b, device, {}); + + if (expected_result == ExpectedResult::OK) { + EXPECT_NO_THROW(run_matmul(a, b, transpose_a, transpose_b)); + } else { + EXPECT_ANY_THROW(run_matmul(a, b, transpose_a, transpose_b)); + } + } +} diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h index 99dd5e81e54f..62d85c771b02 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h @@ -343,6 +343,7 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) { if constexpr (at_kernel_start) { const std::uint32_t output_id = get_output_id(icb_out); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t tile_c_dim = get_output_tile_c_dim(output_id); const std::uint32_t num_faces = get_output_num_faces(output_id); const bool partial_face = get_output_partial_face(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); @@ -358,6 +359,7 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) { pack_dst_format[output_id], tile_size, face_r_dim, + tile_c_dim, num_faces, partial_face, narrow_tile, diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 000000000000..ccd4b2e6df29 --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_binary_left_shift(const uint dst_offset) { + _calculate_binary_left_shift_(dst_offset); +} + +template +inline void calculate_binary_right_shift(const uint dst_offset) { + _calculate_binary_right_shift_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 000000000000..337fdd9df5c7 --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_shift_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 000000000000..ccd4b2e6df29 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_binary_left_shift(const uint dst_offset) { + _calculate_binary_left_shift_(dst_offset); +} + +template +inline void calculate_binary_right_shift(const uint dst_offset) { + _calculate_binary_right_shift_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 000000000000..337fdd9df5c7 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_shift_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 9ef41bce94f1..2ea7d2cd21a7 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -117,7 +117,7 @@ class Program_ { std::vector circular_buffers_unique_coreranges() const; - std::vector> semaphores_on_core(const CoreCoord &core) const; + std::vector> semaphores_on_core(const CoreCoord &core, CoreType core_type) const; size_t num_semaphores () const; void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const; @@ -830,18 +830,18 @@ void detail::Program_::validate_circular_buffer_region(const Device *device) { } } -size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); } +size_t Program::num_semaphores(const CoreCoord &core, CoreType core_type) const { return semaphores_on_core(core, core_type).size(); } size_t detail::Program_::num_semaphores() const { return semaphores_.size(); } size_t Program::num_semaphores() const { return pimpl_->num_semaphores(); } void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { - auto semaphores_on_core = this->semaphores_on_core(logical_core); uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG); uint64_t addr = kernel_config_base + this->program_configs_[programmable_core_type_index].sem_offset; CoreType core_type = hal.get_core_type(programmable_core_type_index); + auto semaphores_on_core = this->semaphores_on_core(logical_core, core_type); for (auto semaphore : semaphores_on_core) { llrt::write_hex_vec_to_core( device.id(), @@ -1772,18 +1772,18 @@ void detail::Program_::release_buffers() { owned_buffer_pool = {}; } void Program::release_buffers() { pimpl_->release_buffers(); } -std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core) const { +std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core, CoreType core_type) const { std::vector> semaphores; for (const Semaphore &s : this->semaphores_) { - if (s.initialized_on_logical_core(core)) { + if (s.initialized_on_logical_core(core) && s.core_type() == core_type) { semaphores.emplace_back(std::cref(s)); } } return semaphores; } -std::vector> Program::semaphores_on_core(const CoreCoord &core) const { - return pimpl_->semaphores_on_core(core); +std::vector> Program::semaphores_on_core(const CoreCoord &core, CoreType core_type) const { + return pimpl_->semaphores_on_core(core, core_type); } bool 
detail::Program_::is_finalized() const { return this->finalized_; } diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index bd1a1c027805..2305723485b4 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -135,9 +135,9 @@ class Program { std::vector circular_buffers_unique_coreranges() const; - std::vector> semaphores_on_core(const CoreCoord &core) const; + std::vector> semaphores_on_core(const CoreCoord &core, CoreType core_type) const; - size_t num_semaphores ( const CoreCoord & core ) const; + size_t num_semaphores ( const CoreCoord & core, CoreType core_type ) const; size_t num_semaphores () const; void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const; // XXXXX TODO: this should return a const reference diff --git a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h index 4de5ee5b55af..f566c7e34da1 100644 --- a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h +++ b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise add operation with the two integer inputs: y = add(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h index cf2a20d00901..1ec6d40cecab 100644 --- a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h +++ b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binary bitwise operation with the two inputs: y = bitwise(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_shift.h b/tt_metal/include/compute_kernel_api/binary_shift.h new file mode 100644 index 000000000000..3bd2ddb9a59d --- /dev/null +++ b/tt_metal/include/compute_kernel_api/binary_shift.h @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_binary_sfpu_shift.h" +#define MAIN math_main() +#define MATH(x) x +#else +#define MATH(x) +#endif + +namespace ckernel { + +/** + * Performs an elementwise shift operation to the left on the input at idst0, by input at idst1: y = x0 << x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. + * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. 
This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------|------------------------------------------------------------------------|----------|--------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | idst1 | The index of the tile in DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + */ +ALWI void binary_left_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_left_shift(idst0, idst1))); +} + +/** + * Performs an elementwise shift operation to the right on the input at idst0, by input at idst1: y = x0 >> x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. + * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------|------------------------------------------------------------------------|----------|--------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | idst1 | The index of the tile in DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + */ + +ALWI void binary_right_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_right_shift(idst0, idst1))); +} + +/** + * Please refer to documentation for any_init. + */ +ALWI void binary_shift_tile_init() { MATH((llk_math_eltwise_binary_sfpu_shift_init())); } + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h index 22fc4c13fcf0..239958919401 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h +++ b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binop operation with the two floating point inputs: y = binop(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats.
diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole
index 7536fbacd75a..973288fb014a 160000
--- a/tt_metal/third_party/tt_llk_blackhole
+++ b/tt_metal/third_party/tt_llk_blackhole
@@ -1 +1 @@
-Subproject commit 7536fbacd75a4ad62047c63c9c54176fae079e06
+Subproject commit 973288fb014a22ce72cdba1c38a9f41f48532d6d
diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0
index 0f57d4e9dec6..33a7f6a02671 160000
--- a/tt_metal/third_party/tt_llk_wormhole_b0
+++ b/tt_metal/third_party/tt_llk_wormhole_b0
@@ -1 +1 @@
-Subproject commit 0f57d4e9dec602b68671be8891e7af876285f275
+Subproject commit 33a7f6a026719af509a119d8a4e8e36c7c31854c
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index b5402a3ac05a..b8fdf165c52c 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -162,13 +162,13 @@ void ConfigureKernelGroup(
     }
 }

-std::optional<uint32_t> get_semaphore_id(const Program& program, const CoreRange& core_range) {
+std::optional<uint32_t> get_semaphore_id(const Program &program, const CoreRange& core_range, CoreType core_type) {
     std::optional<uint32_t> semaphore_id = std::nullopt;
     std::vector<uint32_t> semaphore_histogram(NUM_SEMAPHORES, 0);
     for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
         for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) {
             CoreCoord logical_core(x, y);
-            auto semaphores = program.semaphores_on_core(logical_core);
+            auto semaphores = program.semaphores_on_core(logical_core, core_type);
             if (semaphores.size() == NUM_SEMAPHORES) {
                 TT_THROW(
                     "Cannot add semaphore on core {}. Max number of semaphores ({}) reached!",
@@ -1158,7 +1158,7 @@ uint32_t CreateSemaphore(
     for (const auto& core_range : crs.ranges()) {
         CoreCoord start_core = core_range.start_coord;
         CoreCoord end_core = core_range.end_coord;
-        std::optional<uint32_t> semaphore_id_candidate = get_semaphore_id(program, core_range);
+        std::optional<uint32_t> semaphore_id_candidate = get_semaphore_id(program, core_range, core_type);
         if (!semaphore_id.has_value()) {
             semaphore_id = semaphore_id_candidate;
         } else {
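With semaphore ids now looked up per core type, overlapping worker and ethernet ranges no longer compete for the same id space. A host-side sketch, assuming the v0 host API and that CoreType::WORKER and CoreType::ETH are the relevant core types:

    #include "tt_metal/host_api.hpp"

    using namespace tt::tt_metal;

    void add_semaphores(Program& program) {
        CoreRange cores(CoreCoord{0, 0}, CoreCoord{3, 3});
        // Allocated against the worker cores' table, via semaphores_on_core(core, CoreType::WORKER).
        uint32_t worker_sem = CreateSemaphore(program, cores, /*initial_value=*/0, CoreType::WORKER);
        // Ethernet cores consult a separate table, so this does not collide with worker_sem.
        uint32_t eth_sem = CreateSemaphore(program, cores, /*initial_value=*/0, CoreType::ETH);
    }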
diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp
index 48a360fb3cb2..17de2f3493e0 100644
--- a/ttnn/cpp/pybind11/pytensor.cpp
+++ b/ttnn/cpp/pybind11/pytensor.cpp
@@ -66,17 +66,17 @@ void log_external_operation(
 #endif

 template <typename T>
-Tensor create_owned_tensor(
-    T* data_ptr,
-    size_t num_elements,
-    tt::stl::Span<const uint32_t> shape,
-    DataType data_type,
-    Layout layout,
-    const std::optional<Tile>& optional_tile = std::nullopt) {
-    auto data = std::vector<T>(data_ptr, data_ptr + num_elements);
+Tensor create_owned_tensor(T* data_ptr, const ttnn::TensorSpec& tensor_spec) {
+    std::size_t num_elements = tensor_spec.logical_shape().volume();
+    auto data = std::vector<T>(data_ptr, data_ptr + num_elements);
     auto buffer = owned_buffer::create(std::move(data));
+
+    if (tensor_spec.layout() == Layout::TILE) {
+        data = tensor_impl::convert_layout_row_major_to_tile(tensor_spec.physical_shape(), tensor_spec.tile(), buffer);
+        buffer = owned_buffer::create(std::move(data));
+    }
     auto storage = OwnedStorage{std::move(buffer)};
-    return Tensor(std::move(storage), shape, data_type, layout, optional_tile);
+    return Tensor(std::move(storage), tensor_spec);
 }

 OwnedBuffer create_owned_buffer_from_vector_of_floats(std::vector<float>&& data, DataType data_type) {
@@ -138,7 +138,7 @@ Tensor convert_float_vector_to_tt_tensor(
         return tensor;
     }
     auto owned_buffer = create_owned_buffer_from_vector_of_floats(std::move(data), data_type);
-    auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout, tile);
+    auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR, tile).to(layout);
     if (device) {
         return tensor.to(device, memory_config.value_or(MemoryConfig{}));
     }
@@ -146,23 +146,30 @@ Tensor convert_float_vector_to_tt_tensor(
 }

 Tensor create_tt_tensor_from_py_data(
-    std::size_t num_elements,
     std::size_t py_data_ptr,
-    const ttnn::SmallVector<uint32_t>& shape,
-    const DataType data_type,
-    const std::optional<Tile>& optional_tile,
-    bool enable_borrow,
-    const std::function<void()>& on_creation_callback = [] {},
-    const std::function<void()>& on_destruction_callback = [] {}) {
+    const TensorSpec& tensor_spec,
+    Device* device,
+    bool override_enable_borrow,
+    const std::function<void()>& on_creation_callback,
+    const std::function<void()>& on_destruction_callback) {
+    auto layout = tensor_spec.layout();
+
+    bool enable_borrow = true;
+    if (layout != Layout::ROW_MAJOR or override_enable_borrow) {
+        enable_borrow = false;
+    }
+
+    auto data_type = tensor_spec.data_type();
+    std::size_t num_elements = tensor_spec.logical_shape().volume();
     switch (data_type) {
         case DataType::UINT8: {
             auto data_ptr = reinterpret_cast<uint8_t*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
         case DataType::UINT16: {
@@ -170,9 +177,9 @@ Tensor create_tt_tensor_from_py_data(
             auto data_ptr = reinterpret_cast<uint16_t*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
         case DataType::INT32: {
@@ -180,9 +187,9 @@ Tensor create_tt_tensor_from_py_data(
             auto data_ptr = reinterpret_cast<int32_t*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
         case DataType::UINT32: {
@@ -190,9 +197,9 @@ Tensor create_tt_tensor_from_py_data(
             auto data_ptr = reinterpret_cast<uint32_t*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
        case DataType::FLOAT32: {
@@ -200,9 +207,9 @@ Tensor create_tt_tensor_from_py_data(
             auto data_ptr = reinterpret_cast<float*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
         // TODO: This is not supported for numpy
@@ -211,27 +218,28 @@ Tensor create_tt_tensor_from_py_data(
             auto data_ptr = reinterpret_cast<::bfloat16*>(py_data_ptr);
             if (enable_borrow) {
                 auto storage = BorrowedStorage(
                     borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback);
-                return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return Tensor(std::move(storage), tensor_spec);
             } else {
-                return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile);
+                return create_owned_tensor(data_ptr, tensor_spec);
             }
         }
         case DataType::BFLOAT8_B:
         case DataType::BFLOAT4_B: {
             auto data_ptr = reinterpret_cast<float*>(py_data_ptr);
-            auto data = std::vector<float>(data_ptr, data_ptr + num_elements);
-            auto buffer = owned_buffer::create<float>(std::move(data));
-            auto tile = optional_tile.value_or(Tile());
-            auto tensor = Tensor(OwnedStorage{buffer}, shape, DataType::FLOAT32, Layout::ROW_MAJOR, optional_tile)
-                              .to(Layout::TILE);
-            auto output_float_data = owned_buffer::get_as<float>(tensor).get();
+            auto float_tensor_spec = TensorSpec(
+                tensor_spec.logical_shape(),
+                TensorLayout(DataType::FLOAT32, tensor_spec.page_config(), tensor_spec.memory_config()));
+            auto float_tensor = create_owned_tensor(data_ptr, float_tensor_spec);
+
+            auto tile = tensor_spec.tensor_layout().get_page_config().get_tile();
+            auto output_float_data = owned_buffer::get_as<float>(float_tensor).get();
             auto output_packed_data = data_type == DataType::BFLOAT8_B
                                           ? pack_fp32_vec_as_bfp8_tiles(
                                                 output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile)
                                           : pack_fp32_vec_as_bfp4_tiles(
                                                 output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile);
             auto output_buffer = owned_buffer::create(std::move(output_packed_data));
-            return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), shape, data_type, Layout::TILE, tile);
+            return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), tensor_spec);
         }
         default: {
             TT_THROW("Unsupported DataType: {}", data_type);
@@ -242,16 +250,26 @@ Tensor create_tt_tensor_from_py_data(

 Tensor convert_python_tensor_to_tt_tensor(
     const py::handle& py_tensor,
-    std::optional<DataType> optional_data_type = std::nullopt,
-    const std::optional<Tile>& optional_tile = std::nullopt,
-    bool enable_borrow = true) {
+    std::optional<DataType> optional_data_type,
+    std::optional<Layout> optional_layout,
+    const std::optional<Tile>& optional_tile,
+    const MemoryConfig& memory_config,
+    Device* device,
+    bool override_enable_borrow = false) {
     GraphTracker::instance().track_function_start(
-        "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor", py_tensor, optional_data_type, enable_borrow);
+        "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor",
+        py_tensor,
+        optional_data_type,
+        optional_layout,
+        optional_tile,
+        memory_config,
+        device,
+        override_enable_borrow);
     py::object torch = py::module_::import("torch");
     py::object np = py::module_::import("numpy");

     auto py_dtype = py_tensor.attr("dtype");
-    auto shape = py::cast<ttnn::SmallVector<uint32_t>>(py_tensor.attr("shape"));
+    auto shape = ttnn::SimpleShape(py::cast<ttnn::SmallVector<uint32_t>>(py_tensor.attr("shape")));

     DataType data_type;
@@ -323,7 +341,7 @@ Tensor convert_python_tensor_to_tt_tensor(
         num_elements = py::cast<std::size_t>(contiguous_py_tensor.attr("numel")());
         py_data_ptr = py::cast<std::size_t>(contiguous_py_tensor.attr("data_ptr")());
     } else if (py::isinstance(py_tensor, np.attr("ndarray"))) {
-        TT_FATAL(enable_borrow, "Owned storage for numpy tensors is untested!");
+        TT_FATAL(!override_enable_borrow, "Disabling borrowed buffers for numpy tensors is untested!");

         contiguous_py_tensor = np.attr("ascontiguousarray")(py_tensor);
@@ -386,17 +404,35 @@ Tensor convert_python_tensor_to_tt_tensor(
     } else {
         TT_THROW("The argument must be of type torch.Tensor or numpy.ndarray!");
     }

+    // TODO: Remove check of num_elements from python against volume of ttnn::SimpleShape
+    TT_FATAL(
+        num_elements == shape.volume(),
+        "Number of elements from python tensor {} must match volume of shape {}!",
+        num_elements,
+        shape.volume());
+
+    Layout layout = optional_layout.value_or(Layout::ROW_MAJOR);
+    if (data_type == DataType::BFLOAT8_B or data_type == DataType::BFLOAT4_B) {
+        if (optional_layout.has_value() and optional_layout.value() != Layout::TILE) {
+            log_warning(
+                tt::LogAlways,
+                "Tensor layout must be Layout::TILE for bfloat8_b or bfloat4_b! Tensor layout will be {} instead of "
+                "the requested {}!",
+                Layout::TILE,
+                optional_layout.value());
+        }
+        layout = Layout::TILE;
+    }
+
+    auto tensor_spec = TensorSpec(shape, TensorLayout(data_type, PageConfig(layout, optional_tile), memory_config));
     auto on_creation_callback = [tensor = contiguous_py_tensor] { tensor.inc_ref(); };
     auto on_destruction_callback = [tensor = contiguous_py_tensor] { tensor.dec_ref(); };
     auto output = create_tt_tensor_from_py_data(
-        num_elements,
-        py_data_ptr,
-        shape,
-        data_type,
-        optional_tile,
-        enable_borrow,
-        on_creation_callback,
-        on_destruction_callback);
+        py_data_ptr, tensor_spec, device, override_enable_borrow, on_creation_callback, on_destruction_callback);
+
+    if (device) {
+        output = output.to(device, memory_config);
+    }

     output = tt::tt_metal::set_tensor_id(output);
     GraphTracker::instance().track_function_end(output);
     return output;
@@ -411,7 +447,8 @@ Tensor convert_python_tensors_to_tt_tensors(
         "tt::tt_metal::detail::convert_python_tensors_to_tt_tensors", tensor_shards, data_type, strategy);
     std::vector<Tensor> tt_shards;
     for (const auto& shard : tensor_shards) {
-        tt_shards.push_back(detail::convert_python_tensor_to_tt_tensor(shard, data_type, tile, false));
+        tt_shards.push_back(detail::convert_python_tensor_to_tt_tensor(
+            shard, data_type, Layout::ROW_MAJOR, tile, MemoryConfig{}, nullptr, true));
     }
     std::vector<OwnedBuffer> host_owned_buffers;
     std::vector<ttnn::Shape> host_owned_shapes;
@@ -432,15 +469,68 @@ Tensor convert_python_tensors_to_tt_tensors(
     return output;
 }

-std::pair<std::variant<OwnedBuffer, BorrowedBuffer>, DataType> get_buffer_and_dtype_from_tensor(
-    const Tensor& tt_tensor) {
+template <typename T>
+owned_buffer::Buffer<T> create_row_major_owned_buffer(
+    owned_buffer::Buffer<T> owned_buffer, const ttnn::TensorSpec& tensor_spec) {
+    if (tensor_spec.layout() == Layout::TILE) {
+        auto data = tensor_impl::convert_layout_tile_to_row_major(
+            tensor_spec.physical_shape(), tensor_spec.tile(), owned_buffer);
+        return owned_buffer::create(std::move(data));
+    }
+    return owned_buffer;
+}
+
+std::variant<OwnedBuffer, BorrowedBuffer> get_host_buffer_from_tensor(const Tensor& tt_tensor) {
     TT_ASSERT(tt_tensor.storage_type() == StorageType::OWNED or tt_tensor.storage_type() == StorageType::BORROWED);

-    auto buffer = std::visit(
-        [](auto&& storage) -> std::variant<OwnedBuffer, BorrowedBuffer> {
+    const auto& tensor_spec = tt_tensor.get_tensor_spec();
+    return std::visit(
+        [&tensor_spec, &tt_tensor](auto&& storage) -> std::variant<OwnedBuffer, BorrowedBuffer> {
             using T = std::decay_t<decltype(storage)>;
             if constexpr (std::is_same_v<T, OwnedStorage>) {
-                return storage.buffer;
+                auto tt_dtype = tensor_spec.data_type();
+                switch (tt_dtype) {
+                    case DataType::UINT8: {
+                        return create_row_major_owned_buffer(
+                            owned_buffer::get_as<uint8_t>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::UINT16: {
+                        return create_row_major_owned_buffer(
+                            owned_buffer::get_as<uint16_t>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::INT32: {
+                        return create_row_major_owned_buffer(
+                            owned_buffer::get_as<int32_t>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::UINT32: {
+                        return create_row_major_owned_buffer(
+                            owned_buffer::get_as<uint32_t>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::FLOAT32: {
+                        return create_row_major_owned_buffer(owned_buffer::get_as<float>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::BFLOAT16: {
+                        return create_row_major_owned_buffer(
+                            owned_buffer::get_as<::bfloat16>(storage.buffer), tensor_spec);
+                    }
+                    case DataType::BFLOAT8_B:
+                    case DataType::BFLOAT4_B: {
+                        const auto& tile = tensor_spec.tile();
+                        auto uint32_data = owned_buffer::get_as<uint32_t>(storage.buffer).get();
+                        auto float_unpacked_data =
+                            tt_dtype == DataType::BFLOAT8_B
+                                ? unpack_bfp8_tiles_into_float_vec(
+                                      uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile)
+                                : unpack_bfp4_tiles_into_float_vec(
+                                      uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile);
+                        auto input_float_buffer = owned_buffer::create<float>(std::move(float_unpacked_data));
+                        return create_row_major_owned_buffer(input_float_buffer, tensor_spec);
+                    }
+                    default: {
+                        TT_THROW("Unsupported DataType: {}", tt_dtype);
+                        break;
+                    }
+                }
             } else if constexpr (std::is_same_v<T, DeviceStorage>) {
                 TT_THROW("Device tensor cannot be converted to torch");
             } else if constexpr (std::is_same_v<T, BorrowedStorage>) {
@@ -456,52 +546,64 @@ std::pair<std::variant<OwnedBuffer, BorrowedBuffer>, DataType> get_buffer_and_dt
             }
         },
         tt_tensor.get_storage());
-
-    const auto tile = tt_tensor.get_tensor_spec().tile();
-    auto tt_dtype = tt_tensor.get_dtype();
-    if (tt_dtype == DataType::BFLOAT8_B || tt_dtype == DataType::BFLOAT4_B) {
-        TT_ASSERT(
-            std::holds_alternative<OwnedBuffer>(buffer),
-            "Unexpected type {}",
-            tt::stl::get_active_type_name_in_variant(buffer));
-        auto uint32_data = std::get<owned_buffer::Buffer<uint32_t>>(std::get<OwnedBuffer>(buffer)).get();
-        auto float_unpacked_data =
-            tt_dtype == DataType::BFLOAT8_B
-                ? unpack_bfp8_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile)
-                : unpack_bfp4_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile);
-        auto input_float_buffer = owned_buffer::create<float>(std::move(float_unpacked_data));
-        auto float_tensor = Tensor(
-                                OwnedStorage{input_float_buffer},
-                                tt_tensor.get_shape(),
-                                DataType::FLOAT32,
-                                tt_tensor.get_layout(),
-                                tile)
-                                .to(Layout::ROW_MAJOR);
-        auto output_float_data = owned_buffer::get_as<float>(float_tensor).get();
-        buffer = owned_buffer::create(std::move(output_float_data));
-        tt_dtype = DataType::FLOAT32;
-    }
-
-    return {buffer, tt_dtype};
 }

 py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor) {
     GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor);

-    auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor);
+    auto buffer = get_host_buffer_from_tensor(tt_tensor);

     py::object torch = py::module_::import("torch");
     auto frombuffer = torch.attr("frombuffer");
-    const auto tt_dtype_to_torch_dtype = std::map<DataType, py::object>{
-        {DataType::UINT8, torch.attr("uint8")},
-        {DataType::UINT16, torch.attr("int16")},  // TODO(arakhmati): add DataType::INT16
-        {DataType::INT32, torch.attr("int32")},
-        {DataType::UINT32, torch.attr("int32")},  // TODO(arakhmati): add DataType::INT32
-        {DataType::FLOAT32, torch.attr("float32")},
-        {DataType::BFLOAT16, torch.attr("bfloat16")},
-    };
-    auto torch_dtype = tt_dtype_to_torch_dtype.at(buffer_dtype);
+    auto torch_dtype = [&]() {
+        if (std::holds_alternative<OwnedBuffer>(buffer)) {
+            return std::visit(
+                [&torch](auto& owned_buffer) -> py::object {
+                    using T = std::decay_t<decltype(owned_buffer)>;
+                    if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint8_t>>) {
+                        return torch.attr("uint8");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint16_t>>) {
+                        return torch.attr("int16");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<int32_t>>) {
+                        return torch.attr("int32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint32_t>>) {
+                        return torch.attr("int32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<float>>) {
+                        return torch.attr("float32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<::bfloat16>>) {
+                        return torch.attr("bfloat16");
+                    } else {
+                        static_assert(tt::stl::concepts::always_false_v<T>, "Unsupported buffer!");
+                    }
+                },
+                std::get<OwnedBuffer>(buffer));
+
+        } else if (std::holds_alternative<BorrowedBuffer>(buffer)) {
+            return std::visit(
+                [&torch](auto& borrowed_buffer) -> py::object {
+                    using T = std::decay_t<decltype(borrowed_buffer)>;
+                    if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint8_t>>) {
+                        return torch.attr("uint8");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint16_t>>) {
+                        return torch.attr("int16");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<int32_t>>) {
+                        return torch.attr("int32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint32_t>>) {
+                        return torch.attr("int32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<float>>) {
+                        return torch.attr("float32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<::bfloat16>>) {
+                        return torch.attr("bfloat16");
+                    } else {
+                        static_assert(tt::stl::concepts::always_false_v<T>, "Unsupported buffer!");
+                    }
+                },
+                std::get<BorrowedBuffer>(buffer));
+        } else {
+            TT_THROW("Only OwnedBuffer or BorrowedBuffer is supported for converting to python buffers!");
+        }
+    }();

     auto shape = tt_tensor.get_legacy_shape();
     auto torch_shape = std::vector<std::uint32_t>(std::begin(shape), std::end(shape));
@@ -527,19 +629,59 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor) {

 py::object convert_tt_tensor_to_numpy_tensor(const Tensor& tt_tensor) {
     GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_numpy_tensor", tt_tensor);

-    auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor);
+    auto buffer = get_host_buffer_from_tensor(tt_tensor);
     py::object np = py::module_::import("numpy");
     auto frombuffer = np.attr("frombuffer");
-    const auto tt_dtype_to_np_dtype = std::map<DataType, py::object>{
-        {DataType::UINT8, np.attr("ubyte")},
-        {DataType::UINT16, np.attr("int16")},  // TODO(arakhmati): add DataType::INT16
-        {DataType::INT32, np.attr("int32")},
-        {DataType::UINT32, np.attr("int32")},  // TODO(arakhmati): add DataType::INT32
-        {DataType::FLOAT32, np.attr("float32")},
-    };
-    auto np_dtype = tt_dtype_to_np_dtype.at(buffer_dtype);
+    auto np_dtype = [&]() {
+        if (std::holds_alternative<OwnedBuffer>(buffer)) {
+            return std::visit(
+                [&np](auto& owned_buffer) -> py::object {
+                    using T = std::decay_t<decltype(owned_buffer)>;
+                    if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint8_t>>) {
+                        return np.attr("ubyte");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint16_t>>) {
+                        return np.attr("int16");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<int32_t>>) {
+                        return np.attr("int32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<uint32_t>>) {
+                        return np.attr("int32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<float>>) {
+                        return np.attr("float32");
+                    } else if constexpr (std::is_same_v<T, owned_buffer::Buffer<::bfloat16>>) {
+                        TT_THROW("Bfloat16 is not supported for numpy!");
+                    } else {
+                        static_assert(tt::stl::concepts::always_false_v<T>, "Unsupported buffer!");
+                    }
+                },
+                std::get<OwnedBuffer>(buffer));
+
+        } else if (std::holds_alternative<BorrowedBuffer>(buffer)) {
+            return std::visit(
+                [&np](auto& borrowed_buffer) -> py::object {
+                    using T = std::decay_t<decltype(borrowed_buffer)>;
+                    if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint8_t>>) {
+                        return np.attr("ubyte");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint16_t>>) {
+                        return np.attr("int16");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<int32_t>>) {
+                        return np.attr("int32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<uint32_t>>) {
+                        return np.attr("int32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<float>>) {
+                        return np.attr("float32");
+                    } else if constexpr (std::is_same_v<T, borrowed_buffer::Buffer<::bfloat16>>) {
+                        TT_THROW("Bfloat16 is not supported for numpy!");
+                    } else {
+                        static_assert(tt::stl::concepts::always_false_v<T>, "Unsupported buffer!");
+                    }
+                },
+                std::get<BorrowedBuffer>(buffer));
+        } else {
+            TT_THROW("Only OwnedBuffer or BorrowedBuffer is supported for converting to python buffers!");
+        }
+    }();

     auto shape = tt_tensor.get_legacy_shape();
     auto np_shape = std::vector<std::uint32_t>(std::begin(shape), std::end(shape));
@@ -842,7 +984,8 @@ void pytensor_module(py::module& m_tensor) {
                 if (py::isinstance<py::list>(tensor)) {
                     return detail::convert_python_tensors_to_tt_tensors(tensor, data_type, tile, strategy);
                 }
-                return detail::convert_python_tensor_to_tt_tensor(tensor, data_type, tile);
+                return detail::convert_python_tensor_to_tt_tensor(
+                    tensor, data_type, std::nullopt, tile, MemoryConfig{}, nullptr);
             }),
             py::arg("tensor"),
             py::arg("data_type") = std::nullopt,
@@ -857,6 +1000,8 @@ void pytensor_module(py::module& m_tensor) {
                 +--------------+------------------------+
                 | data_type    | TT Tensor data type    |
                 +--------------+------------------------+
+                | tile         | TT Tile Spec           |
+                +--------------+------------------------+

                 Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage:

@@ -872,16 +1017,15 @@ void pytensor_module(py::module& m_tensor) {
                 Layout layout,
                 const MemoryConfig& mem_config,
                 const std::optional<Tile>& tile) {
-                auto tensor = detail::convert_python_tensor_to_tt_tensor(python_tensor, data_type, tile);
-                auto layout_tensor = tensor.to(layout);
-                return layout_tensor.to(device, mem_config);
+                return detail::convert_python_tensor_to_tt_tensor(
+                    python_tensor, data_type, layout, tile, mem_config, device);
             }),
             py::arg("tensor"),
             py::arg("data_type") = std::nullopt,
-            py::arg("device").noconvert(),
-            py::arg("layout").noconvert(),
-            py::arg("mem_config").noconvert(),
-            py::arg("tile") = std::nullopt,
+            py::arg("device") = nullptr,
+            py::arg("layout").noconvert() = Layout::ROW_MAJOR,
+            py::arg("mem_config").noconvert() = MemoryConfig{},
+            py::arg("tile").noconvert() = std::nullopt,
             py::return_value_policy::move,
             R"doc(
                 +--------------+------------------------+
@@ -897,14 +1041,17 @@ void pytensor_module(py::module& m_tensor) {
                 +--------------+------------------------+
                 | mem_config   | TT memory_config       |
                 +--------------+------------------------+
+                | tile         | TT Tile Spec           |
+                +--------------+------------------------+

-                Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage:
+                Example of creating a TT Tensor from numpy tensor:

                 .. code-block:: python

+                    device = ttnn.open_device(device_id=0)
                     py_tensor = np.zeros((1, 1, 32, 32))
-                    ttnn.Tensor(py_tensor)
+                    ttnn.Tensor(py_tensor, ttnn.bfloat16, device, ttnn.TILE_LAYOUT)
             )doc")
         .def_property_readonly("shape", [](const Tensor& self) { return self.get_shape(); })
         .def_property_readonly("dtype", [](const Tensor& self) { return self.get_dtype(); })
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp
index 9348316a2499..696aad3dfe4b 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0

 #include <stdint.h>
+#include <string.h>
 #include "dataflow_api.h"

 inline __attribute__((always_inline)) void fill_pad_cb_with_val(
@@ -37,8 +38,10 @@ void kernel_main() {
     constexpr uint32_t stick_size_padded_end = get_compile_time_arg_val(10);
     constexpr uint32_t num_zero_pad_sticks_read = get_compile_time_arg_val(11);
     constexpr uint32_t last_zero_stick_size = get_compile_time_arg_val(12);
+    constexpr uint32_t stick_size_padded_aligned = get_compile_time_arg_val(21);

 #define not_pad_by_zero get_compile_time_arg_val(13) == 1
+#define front_padding get_compile_time_arg_val(9)
 #if (not_pad_by_zero)
     constexpr uint32_t packed_pad_value = get_compile_time_arg_val(14);
     constexpr uint32_t row_major_min_bytes = get_compile_time_arg_val(15);
@@ -47,8 +50,9 @@ void kernel_main() {
     constexpr uint32_t num_sticks_padded_read = get_compile_time_arg_val(18);
 #endif

-    constexpr auto cb_in0 = tt::CBIndex::c_0;
-    constexpr auto cb_pad = tt::CBIndex::c_1;
+    constexpr uint32_t cb_in0 = tt::CBIndex::c_0;
+    constexpr uint32_t cb_pad = tt::CBIndex::c_1;
+    constexpr uint32_t cb_pad_align = tt::CBIndex::c_2;

 #define stick_size_is_pow2 get_compile_time_arg_val(19) == 1
 #if (stick_size_is_pow2)
@@ -68,8 +72,14 @@ void kernel_main() {
     uint64_t pad_val_addr = get_read_ptr(cb_pad);
     uint64_t pad_val_noc_addr = get_noc_addr(pad_val_addr);

+    uint64_t pad_align_addr = get_read_ptr(cb_pad_align);
+    uint64_t pad_align_write_addr = get_write_ptr(cb_pad_align);
+    uint64_t pad_align_noc_addr = get_noc_addr(pad_align_addr);
+
 #if (not_pad_by_zero)
-    fill_pad_cb_with_val(cb_pad, row_major_min_bytes, packed_pad_value);
+    fill_pad_cb_with_val(cb_pad, stick_size_padded, packed_pad_value);
+#else
+    fill_pad_cb_with_val(cb_pad, stick_size_padded, 0);
 #endif

     uint32_t i_stick = start_id;
@@ -82,55 +92,23 @@ void kernel_main() {
             bool read_stick = (curr_h >= front_pad_h and curr_h < H) and (curr_c >= front_pad_c and curr_c < C) and
                               (curr_n >= front_pad_n and curr_n < N);
             uint64_t read_noc_addr = get_noc_addr(i_stick, s);
+            noc_async_read(pad_val_noc_addr, l1_write_addr, stick_size_padded);
             if (read_stick) {
-#if (not_pad_by_zero)
-                if constexpr (stick_size_padded_front != 0) {
-                    for (uint32_t j = 0; j < num_front_pad_sticks_read; ++j) {
-                        noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes);
-                        l1_write_addr += row_major_min_bytes;
-                    }
-                }
+#if (front_padding)
+                // Read noc into cb_pad_align l1
+                noc_async_read(read_noc_addr, get_write_ptr(cb_pad_align), stick_size_bytes);
+                noc_async_read_barrier();
+                memmove(
+                    (void*)(l1_write_addr + stick_size_padded_front),
+                    (void*)(get_read_ptr(cb_pad_align)),
+                    (size_t)(stick_size_bytes));
 #else
-                if constexpr (stick_size_padded_front != 0) {
-                    noc_async_read(zeros_noc_addr, l1_write_addr, stick_size_padded_front);
-                    l1_write_addr += stick_size_padded_front;
-                }
-#endif
                 noc_async_read(read_noc_addr, l1_write_addr, stick_size_bytes);
-                l1_write_addr += stick_size_bytes;
-                i_stick++;
-
-#if (not_pad_by_zero)
-                if constexpr (stick_size_padded_end != 0) {
-                    for (uint32_t j = 0; j < num_end_pad_sticks_read; ++j) {
-                        noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes);
-                        l1_write_addr += row_major_min_bytes;
-                    }
-                }
-#else
-                if constexpr (stick_size_padded_end != 0) {
-                    noc_async_read(zeros_noc_addr, l1_write_addr, stick_size_padded_end);
-                    l1_write_addr += stick_size_padded_end;
-                }
-#endif
-
-            } else {
-#if (not_pad_by_zero)
-                for (uint32_t j = 0; j < num_sticks_padded_read; ++j) {
-                    noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes);
-                    l1_write_addr += row_major_min_bytes;
-                }
-#else
-                for (uint32_t j = 0; j < num_zero_pad_sticks_read; ++j) {
-                    auto read_bytes = j == num_zero_pad_sticks_read - 1 ? last_zero_stick_size : 512;
-                    noc_async_read(zeros_noc_addr, l1_write_addr, read_bytes);
-                    l1_write_addr += read_bytes;
-                }
 #endif
+                i_stick++;
             }
-
+            l1_write_addr += stick_size_padded_aligned;
             curr_h++;
             if (curr_h == H_padded) {
                 curr_c++;
@@ -142,7 +120,6 @@ void kernel_main() {
             }
         }
         noc_async_read_barrier();
-        cb_push_back(cb_in0, num_read_per_barrier);
     }
 }
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp
index 658a24bb6684..80ff4013d627 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp
@@ -13,6 +13,7 @@ void kernel_main() {
     constexpr uint32_t cb_out0 = get_compile_time_arg_val(0);
     constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1;
     constexpr uint32_t W_size_bytes = get_compile_time_arg_val(2);
+    constexpr uint32_t stick_size_padded_aligned = get_compile_time_arg_val(5);

     const uint32_t stick_size_bytes = W_size_bytes;
@@ -38,7 +39,7 @@ void kernel_main() {
         for (uint32_t i = 0; i < num_read_per_barrier; ++i) {
             uint64_t write_noc_addr = get_noc_addr(i_stick, s);
             noc_async_write(l1_read_addr, write_noc_addr, stick_size_bytes);
-            l1_read_addr += stick_size_bytes;
+            l1_read_addr += stick_size_padded_aligned;
             i_stick += 1;
         }
         noc_async_write_barrier();
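Condensed, the rewritten reader handles each output stick in two steps: blanket-fill the stick with pad values, then drop the payload in at its offset. The scratch CB appears to exist because a NoC read wants an aligned L1 destination while the front-pad offset generally is not aligned, hence the staged read plus memmove. An annotated restatement, not the literal kernel:

    noc_async_read(pad_val_noc_addr, l1_write_addr, stick_size_padded);  // 1) fill the whole stick with pad value
    if (read_stick) {
    #if (front_padding)
        // 2a) stage the payload in cb_pad_align, then memmove it to the unaligned offset
        noc_async_read(read_noc_addr, get_write_ptr(cb_pad_align), stick_size_bytes);
        noc_async_read_barrier();
        memmove((void*)(l1_write_addr + stick_size_padded_front),
                (void*)(get_read_ptr(cb_pad_align)), (size_t)stick_size_bytes);
    #else
        // 2b) no front pad: the payload lands directly at the stick base
        noc_async_read(read_noc_addr, l1_write_addr, stick_size_bytes);
    #endif
        i_stick++;
    }
    l1_write_addr += stick_size_padded_aligned;  // sticks are stored at an L1-aligned stride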
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index e0b5e4859d01..af7ea10d2259 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -1029,6 +1029,7 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(
     auto stick_size_padded = W_padded * a.element_size();
     auto stick_size_padded_front = front_pad[-1] * a.element_size();
     auto stick_size_padded_end = stick_size_padded - stick_size - stick_size_padded_front;
+    uint32_t stick_size_padded_aligned = align(stick_size_padded, hal.get_alignment(HalMemType::L1));
     uint32_t row_major_min_bytes = 16;

     tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype());
@@ -1050,24 +1051,31 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(
         num_sticks_padded_per_core_group_2] =
         tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, NCH_padded);

-    uint32_t src0_cb_index = 0;
+    uint32_t src0_cb_index = tt::CBIndex::c_0;
     auto num_sticks = num_sticks_padded_per_core_group_1 > num_sticks_padded_per_core_group_2
                           ? num_sticks_padded_per_core_group_1
                           : num_sticks_padded_per_core_group_2;
     tt::tt_metal::CircularBufferConfig cb_src0_config =
-        tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded, {{src0_cb_index, cb_data_format}})
-            .set_page_size(src0_cb_index, stick_size_padded);
+        tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}})
+            .set_page_size(src0_cb_index, stick_size_padded_aligned);
     auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config);

     // construct const buffer with the pad_value
     bool not_pad_by_zero = pad_value != 0;
-    if (not_pad_by_zero) {
-        uint32_t src1_cb_index = 1;
-        tt::tt_metal::CircularBufferConfig cb_src1_config =
-            tt::tt_metal::CircularBufferConfig(row_major_min_bytes, {{src1_cb_index, cb_data_format}})
-                .set_page_size(src1_cb_index, row_major_min_bytes);
-        auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config);
+
+    uint32_t src1_cb_index = tt::CBIndex::c_1;
+    tt::tt_metal::CircularBufferConfig cb_src1_config =
+        tt::tt_metal::CircularBufferConfig(stick_size_padded_aligned, {{src1_cb_index, cb_data_format}})
+            .set_page_size(src1_cb_index, stick_size_padded_aligned);
+    auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config);
+
+    if (stick_size_padded_front != 0) {
+        uint32_t src2_cb_index = tt::CBIndex::c_2;
+        tt::tt_metal::CircularBufferConfig cb_src2_config =
+            tt::tt_metal::CircularBufferConfig(stick_size_padded_aligned, {{src2_cb_index, cb_data_format}})
+                .set_page_size(src2_cb_index, stick_size_padded_aligned);
+        auto cb_src2 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src2_config);
     }

     Buffer* src0_buffer = a.buffer();
@@ -1104,13 +1112,15 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(
         (std::uint32_t)(stick_size_padded_end / row_major_min_bytes),
         (std::uint32_t)(stick_size_padded / row_major_min_bytes),
         (std::uint32_t)src_stick_size_is_power_of_two,
-        (std::uint32_t)src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size};
+        (std::uint32_t)src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size,
+        (std::uint32_t)stick_size_padded_aligned};
     std::vector<uint32_t> writer_ct_args = {
         (std::uint32_t)src0_cb_index,
         (std::uint32_t)dst_is_dram,
         (std::uint32_t)stick_size_padded,
         (std::uint32_t)dst_stick_size_is_power_of_two,
-        (std::uint32_t)dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded};
+        (std::uint32_t)dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded,
+        (std::uint32_t)stick_size_padded_aligned};

     KernelHandle reader_kernel_id = CreateKernel(
         program,
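The CB page sizing above hinges on one computation: the padded stick size is rounded up to the L1 alignment, and that rounded value becomes both the page size and the per-stick stride in the reader and writer. A worked sketch of the arithmetic; the 16 B alignment is an assumed value for illustration, while the real code queries hal.get_alignment(HalMemType::L1):

    #include <cstdint>

    // Hypothetical stand-in for the align() helper used in the program factory.
    constexpr uint32_t align_up(uint32_t v, uint32_t alignment) {
        return ((v + alignment - 1) / alignment) * alignment;
    }

    // e.g. a 30-element bfloat16 stick: 30 * 2 = 60 bytes of payload plus pad,
    // rounded up to a 16 B L1 alignment -> a 64 B CB page and L1 stride.
    static_assert(align_up(60, 16) == 64);
    static_assert(align_up(64, 16) == 64);  // already-aligned sticks are unchanged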
diff --git a/ttnn/cpp/ttnn/tensor/tensor_spec.hpp b/ttnn/cpp/ttnn/tensor/tensor_spec.hpp
index 125b3bb719f4..172e0d881f5f 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_spec.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_spec.hpp
@@ -28,6 +28,7 @@ class TensorSpec final {
     DataType data_type() const { return tensor_layout_.get_data_type(); }
     Layout layout() const { return tensor_layout_.get_layout(); }
     PageConfig page_config() const { return tensor_layout_.get_page_config(); }
+    const MemoryConfig& memory_config() const { return tensor_layout_.get_memory_config(); }
     const ttnn::SimpleShape& padded_shape() const { return cached_padded_shape_; }
     const Size& physical_shape() const { return cached_physical_shape_; }
     ttnn::Shape shape() const { return ttnn::Shape(logical_shape_.view(), cached_padded_shape_.view()); }
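The new accessor rounds out TensorSpec so the pytensor paths above can pull everything from a single object. A sketch of how such a spec is assembled and queried, mirroring the TensorSpec construction in convert_python_tensor_to_tt_tensor (namespace qualifiers elided; types are the assumed ttnn host types):

    // Assumed ttnn host types, matching the usage in the diff above.
    SimpleShape shape({1, 1, 32, 32});
    TensorSpec spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE, std::nullopt), MemoryConfig{}));

    auto dtype = spec.data_type();           // DataType::BFLOAT16
    auto layout = spec.layout();             // Layout::TILE
    const auto& mem = spec.memory_config();  // the accessor added here
    auto physical = spec.physical_shape();   // tile-padded 2D shape used when (un)tilizing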
diff --git a/ttnn/tt_lib/fused_ops/softmax.py b/ttnn/tt_lib/fused_ops/softmax.py
index f5b2f5fceb4d..904b4cea008e 100644
--- a/ttnn/tt_lib/fused_ops/softmax.py
+++ b/ttnn/tt_lib/fused_ops/softmax.py
@@ -42,7 +42,7 @@ def ref_stable_softmax(x):

 if __name__ == "__main__":
-    device = ttnn.open_device(0)
+    device = ttnn.open_device(device_id=0)

     H, W = 64, 96
     torch.manual_seed(123)
diff --git a/ttnn/tt_lib/utils.py b/ttnn/tt_lib/utils.py
index 9883666b81f0..a61f97594641 100644
--- a/ttnn/tt_lib/utils.py
+++ b/ttnn/tt_lib/utils.py
@@ -8,6 +8,8 @@
 import torch
 import numpy as np

+from typing_extensions import deprecated
+

 def _nearest_32(x):
     return math.ceil(x / 32) * 32
@@ -134,108 +136,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa
     return ret.reshape(ret_shape)


+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def tilize(x):
-    """
-    This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function
-    produces row major tiles and creates faces. The output of this function is a flattened list that
-    we can send to the device.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
-    """
-    nearest_32 = _nearest_32
-
-    assert isinstance(
-        x, (torch.Tensor, np.ndarray)
-    ), "Input to this function must be an instance of torch.Tensor or np.array"
-    assert len(x.shape) == 4, "Only 4D tensors suppported"
-    assert (x.shape[-2] % 32) == 0 and (
-        x.shape[-1] % 32
-    ) == 0, "The last two dimensions of the tensor must be divisible by 32"
-
-    if isinstance(x, torch.Tensor):
-        ret = torch.zeros(np.prod(x.shape))
-    else:
-        ret = np.zeros(np.prod(x.shape))
-
-    idx = 0
-    for B in range(x.shape[0]):
-        for C in range(x.shape[1]):
-            for H in range(0, x.shape[2], 32):
-                for W in range(0, x.shape[3], 32):
-                    unfaced_tile = x[B, C, H : H + 32, W : W + 32]
-
-                    face0 = unfaced_tile[:16, :16]
-                    face1 = unfaced_tile[:16, 16:]
-                    face2 = unfaced_tile[16:, :16]
-                    face3 = unfaced_tile[16:, 16:]
-
-                    for face in (face0, face1, face2, face3):
-                        ret[idx : idx + 256] = face.reshape(-1)
-                        idx += 256
-
-    return ret.reshape(x.shape)
+    return x


+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def tilize_to_list(x):
     """
-    Tilize a PyTorch and then return the values as a flat list. The last two
-    tensor dims must be divisible by 32, after which this function produces row
-    major tiles and creates faces.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
+    Returns a flattened list of the tensor
     """

-    return tilize(x).reshape(-1).tolist()
+    return x.reshape(-1).tolist()


+@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:")
 def untilize(x):
-    """
-    This function untilizes a tensor to row major format.
-
-    :param x: Input PyTorch Tensor
-    :type x: class:`torch.Tensor`
-
-    WARNING: This function should eventually be retired in favour of fully tilizing on device.
-    """
-    nearest_32 = _nearest_32
-
-    assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor"
-    assert len(x.shape) == 4, "Only 4D tensors suppported"
-    assert (x.shape[-2] % 32) == 0 and (
-        x.shape[-1] % 32
-    ) == 0, "The last two dimensions of the tensor must be divisible by 32"
-
-    if isinstance(x, torch.Tensor):
-        ret = torch.zeros(x.shape)
-    else:
-        ret = np.zeros(x.shape)
-
-    for B in range(x.shape[0]):
-        for C in range(x.shape[1]):
-            x_hw = x[B, C, :].reshape(-1)
-            hw = 0
-            for h in range(0, x.shape[2], 32):
-                for w in range(0, x.shape[3], 32):
-                    f_tile = x_hw[hw : hw + 256].reshape(16, 16)
-                    ret[B, C, h : h + 16, w : w + 16] = f_tile
-
-                    f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16)
-                    ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile
-
-                    f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16)
-                    ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile
-
-                    f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16)
-                    ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile
-                    hw += 1024  # traverse tiles in RM-order
-
-    return ret
+    return x


 def print_diff_argmax(a, b, annotation=""):