diff --git a/.github/actions/docker-run/action.yml b/.github/actions/docker-run/action.yml index 59a26859289..54f7d431769 100644 --- a/.github/actions/docker-run/action.yml +++ b/.github/actions/docker-run/action.yml @@ -38,7 +38,7 @@ runs: uses: ./.github/actions/generate-docker-tag with: image: ${{ inputs.docker_os_arch }} - - name: Set + - name: Set shell: bash run: | echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV @@ -63,8 +63,10 @@ runs: # The most important option below is `--rm`. Otherwise, the machines will fill up with undeleted containers. # The mounting of /etc/passwd, /etc/shadow, and /etc/bashrc is required in order for the correct file permissions # for newly created files. - # Passing HOME variable is necessary to avoid Python lib installation into /home/ubuntu/.local folder which + # Passing HOME variable is necessary to avoid Python lib installation into /home/ubuntu/.local folder which # may not be writable by the RUNNER_UID user. + # --log-driver none: Do not save logs to disk as we're printing them to GitHub + # and it takes up space options: | -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }} --rm @@ -73,6 +75,7 @@ runs: -v /etc/bashrc:/etc/bashrc:ro -v ${{ github.workspace }}:${{ github.workspace }} --net=host + --log-driver none ${{ inputs.docker_opts }} -e LOGURU_LEVEL=${{ env.LOGURU_LEVEL }} -e PYTHONPATH=${{ github.workspace }} diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index 914ba2fa1ee..df96cc19546 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -8,6 +8,38 @@ on: - "main" jobs: + pre-commit: + name: Run Pre-commit Hooks + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history so 'origin/main' is available + fetch-refs: true # Ensure all refs are fetched + + - name: Set up Python + uses: actions/setup-python@v4 
+ with: + python-version: 3.11 + + - name: Run Pre-commit + uses: pre-commit/action@v3.0.1 + with: + extra_args: | + --from-ref ${{ github.event_name == 'pull_request' && format('refs/remotes/origin/{0}', github.event.pull_request.base.ref) || 'HEAD^' }} \ + --to-ref HEAD + continue-on-error: false + check-black: + runs-on: ubuntu-latest + steps: + - name: Do Nothing + run: echo "Black is covered by pre-commit. This is a placeholder to be removed after updating branch restrictions." + + check-spdx-licenses: runs-on: ubuntu-latest steps: @@ -27,11 +59,6 @@ jobs: - uses: actions/checkout@v4 - name: Check kernel count in base metal is less than maximum run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 7 )); then exit 1; fi - check-black: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: psf/black@23.10.1 check-doc: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index a6d5e1f0bea..bcfbd7bf2f4 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -127,6 +127,8 @@ jobs: -e ARCH_NAME=${{ matrix.arch }} -w ${{ github.workspace }} run: | + set -eu # basic shell hygiene + # /tmp is a tmpfs; more efficient than persisted storage mkdir -p /tmp/ccache export CCACHE_TEMPDIR=/tmp/ccache diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 8f1777db303..eb55fb592cc 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -54,27 +54,6 @@ jobs: cmd: tests/scripts/single_card/nightly/run_gs_only.sh, timeout: 40 }, - { - name: "API tests GS", - arch: grayskull, - runs-on: ["cloud-virtual-machine", "E150", "in-service"], - cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, - 
timeout: 10 - }, - { - name: "API tests N300 WH B0", - arch: wormhole_b0, - runs-on: ["cloud-virtual-machine", "N300", "in-service"], - cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, - timeout: 10 - }, - { - name: "API tests N150 WH B0", - arch: wormhole_b0, - runs-on: ["cloud-virtual-machine", "N150", "in-service"], - cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, - timeout: 10 - }, { name: "[Unstable] N150 models", arch: wormhole_b0, diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 40371321e11..f216b98debc 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -145,6 +145,15 @@ on: - eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw - eltwise.unary_backward.lgamma_bw.lgamma_bw - eltwise.unary_backward.multigammaln_bw.multigammaln_bw + - eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw + - eltwise.unary_backward.elu_bw.elu_bw + - eltwise.unary_backward.celu_bw.celu_bw + - eltwise.unary_backward.selu_bw.selu_bw + - eltwise.unary_backward.silu_bw.silu_bw + - eltwise.unary_backward.floor_bw.floor_bw + - eltwise.unary_backward.tanhshrink_bw.tanhshrink_bw + - eltwise.unary_backward.hardswish_bw.hardswish_bw + - eltwise.unary_backward.rpow_bw.rpow_bw - eltwise.unary.lgamma - eltwise.unary.logit - eltwise.unary.mish @@ -211,6 +220,8 @@ on: - eltwise.binary_backward.subalpha_bw.subalpha_bw - eltwise.binary_backward.xlogy_bw.xlogy_bw - eltwise.binary_backward.hypot_bw.hypot_bw + - eltwise.binary_backward.rsub_bw.rsub_bw + - eltwise.binary_backward.squared_difference_bw.squared_difference_bw - eltwise.composite.binary.addalpha.addalpha - eltwise.composite.binary.subalpha.subalpha - eltwise.composite.binary.minimum.minimum @@ -228,6 +239,7 @@ on: - eltwise.ternary.where.where_pytorch2 - reduction.topk.topk - reduction.argmax.argmax + - embedding.embedding - 
matmul.full.matmul_default_block_sharded - matmul.full.matmul_default_height_sharded - matmul.full.matmul_default_interleaved @@ -257,6 +269,7 @@ on: - data_movement.index_select.index_select_pytorch2 - data_movement.split.split_with_sizes_pytorch2 - data_movement.repeat.repeat + - data_movement.nonzero.nonzero - conv2d.full.conv2d_misc - conv2d.full.conv2d_sharding - conv2d.full.conv2d_sliding_window diff --git a/README.md b/README.md index d917d520b0d..6863a9a9a9b 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,6 @@ | [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | | | [ViT](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | | | [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | | -| [U-Net](./models/experimental/functional_unet) | 2 | [n150](https://tenstorrent.com/hardware/wormhole) | 530 | 1000 | [v0.53.0-rc22](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc22) | ## NLPs diff --git a/models/experimental/yolov4/demo/demo.py b/models/experimental/yolov4/demo/demo.py index 3c340808b25..ea4aa375530 100644 --- a/models/experimental/yolov4/demo/demo.py +++ b/models/experimental/yolov4/demo/demo.py @@ -13,7 +13,8 @@ from models.experimental.yolov4.reference.yolov4 import Yolov4 from models.experimental.yolov4.ttnn.yolov4 import TtYOLOv4 - +from models.experimental.yolov4.ttnn.weight_parameter_update import update_weight_parameters +from collections import OrderedDict import ttnn from models.utility_functions import skip_for_grayskull @@ -418,11 +419,7 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class if not is_torch_model: input_shape = img.shape input_tensor = torch.permute(img, (0, 2, 3, 1)) - - input_tensor = input_tensor.reshape( - input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], 
input_tensor.shape[3] - ) - input_tensor = ttnn.from_torch(input_tensor, device=device) + input_tensor = ttnn.from_torch(input_tensor, ttnn.bfloat16) img = input_tensor t1 = time.time() @@ -534,33 +531,44 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -def test_yolov4_model(device, model_location_generator, reset_seeds, input_path): +@pytest.mark.parametrize( + "use_pretrained_weight", + [True, False], + ids=[ + "pretrained_weight_true", + "pretrained_weight_false", + ], +) +def test_yolov4_model(device, model_location_generator, reset_seeds, input_path, use_pretrained_weight): model_path = model_location_generator("models", model_subdir="Yolo") - if model_path == "models": - if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble - os.system( - "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" - ) # execute the yolov4_weights_download.sh file - - weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" - else: - weights_pth = str(model_path / "yolov4.pth") - - ttnn_model = TtYOLOv4(weights_pth) - - torch_model = Yolov4() + if use_pretrained_weight: + if model_path == "models": + if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble + os.system( + "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" + ) # execute the yolov4_weights_download.sh file + + weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" + else: + weights_pth = str(model_path / "yolov4.pth") - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} + ttnn_model = TtYOLOv4(weights_pth) + torch_model = Yolov4() + new_state_dict = {} + ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - keys = [name for name, parameter in torch_model.state_dict().items()] - values = 
[parameter for name, parameter in ds_state_dict.items()] + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] - torch_model.load_state_dict(new_state_dict) - torch_model.eval() + torch_model.load_state_dict(new_state_dict) + torch_model.eval() + else: + torch_model = Yolov4.from_random_weights() + ttnn_weights = update_weight_parameters(OrderedDict(torch_model.state_dict())) + ttnn_model = TtYOLOv4(ttnn_weights) n_classes = 80 namesfile = "models/experimental/yolov4/demo/coco.names" diff --git a/models/experimental/yolov4/reference/yolov4.py b/models/experimental/yolov4/reference/yolov4.py index d65869585c5..691c808a29c 100644 --- a/models/experimental/yolov4/reference/yolov4.py +++ b/models/experimental/yolov4/reference/yolov4.py @@ -36,3 +36,16 @@ def forward(self, input: torch.Tensor): x4, x5, x6 = self.head(x20, x13, x6) return x4, x5, x6 + + @staticmethod + def from_random_weights(): + model = Yolov4() + model.eval() + + new_state_dict = {} + for name, parameter in model.state_dict().items(): + if isinstance(parameter, torch.FloatTensor): + new_state_dict[name] = parameter + + model.load_state_dict(new_state_dict) + return model diff --git a/models/experimental/yolov4/ttnn/weight_parameter_update.py b/models/experimental/yolov4/ttnn/weight_parameter_update.py new file mode 100644 index 00000000000..cfe3d864713 --- /dev/null +++ b/models/experimental/yolov4/ttnn/weight_parameter_update.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import re +from collections import OrderedDict + + +def update_weigth_keys(key): + key = key.replace("downsample", "down") + key = key.replace("neck", "neek") + if ".res" in key: + + def res_name_update(match): + chr = match.group(1) + num = int(match.group(2)) + if num == 0 or num == 1: + return f".{chr}.0.conv.{num}." + if num == 3 or num == 4: + return f".{chr}.1.conv.{num-3}." + + key = re.sub(r"\.res\.", r".resblock.", key) + key = re.sub(r"\.(\d+)\.(\d+)\.", res_name_update, key) + return key + if "neek" in key: + + def neek_underscore_update_rule(match): + chr = match.group(1) + num1 = int(match.group(2)) + num2 = int(match.group(3)) + dict = { + (7, 2): 8, + (7, 3): 9, + (7, 4): 11, + (8, 2): 12, + (7, 5): 13, + (9, 2): 15, + (9, 3): 16, + (9, 4): 18, + (10, 2): 19, + (9, 5): 20, + } + if chr == "b": + return f".conv{dict[(num1, num2)]}.conv.1." + return f".conv{dict[(num1, num2)]}.conv.0." + + def neck_rename_update(match): + chr = match.group(1) + num = int(match.group(2)) + if num <= 7: + return f".conv{num}.conv.1." if chr == "b" else f".conv{num}.conv.0." + dict = {8: 10, 9: 14, 10: 17} + return f".conv{dict[num]}.conv.1." if chr == "b" else f".conv{dict[num]}.conv.0." 
+ + updated_name = re.sub(r"\.([a-z])(\d+)_(\d+)\.", neek_underscore_update_rule, key) + if key != updated_name: # chk if name got updated + return updated_name + updated_name = re.sub(r"\.([a-z])(\d+)\.", neck_rename_update, key) + if key != updated_name: + return updated_name + key = re.sub(r"\.c(\d+)\.", r".conv\1.conv.0.", key) + key = re.sub(r"\.b(\d+)\.", r".conv\1.conv.1.", key) + return key + + +def update_weight_parameters(model_weight): + ttnn_model_random_weight = OrderedDict() + for key, weight in model_weight.items(): + updated_key = update_weigth_keys(key) + ttnn_model_random_weight[updated_key] = weight + return ttnn_model_random_weight diff --git a/models/experimental/yolov4/ttnn/yolov4.py b/models/experimental/yolov4/ttnn/yolov4.py index fd951678893..015e490d24f 100644 --- a/models/experimental/yolov4/ttnn/yolov4.py +++ b/models/experimental/yolov4/ttnn/yolov4.py @@ -25,7 +25,10 @@ class TtYOLOv4: def __init__(self, path) -> None: - self.torch_model = torch.load(path) + if type(path) is str: + self.torch_model = torch.load(path) + else: + self.torch_model = path self.torch_keys = self.torch_model.keys() self.down1 = Down1(self) self.down2 = Down2(self) diff --git a/tech_reports/ttnn/graph-tracing.md b/tech_reports/ttnn/graph-tracing.md new file mode 100644 index 00000000000..e1e07c1d731 --- /dev/null +++ b/tech_reports/ttnn/graph-tracing.md @@ -0,0 +1,708 @@ +# TT-NN Graph Trace +TT-NN provides a mechanism for tracing operations and memory activities in a neural network's execution. + +Using this trace it is possible to analyze the operation even without executing it on the accelerator. +The output trace can then be processed to get a single number like Peak Memory Load or to print tabular data or visualize a call graph. + +## 🪄 How to Use +Wrap any number of TT-NN calls with `GraphProcessor::begin_graph_capture` and `GraphProcessor::end_graph_capture` or use with any callable. 
+In the example below `ttnn::zeros` is not included in a trace, but `ttnn::add` is +https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/gtests/test_graph_add.cpp#L50-L58 + +You can then analyze the trace with some of the provided utility functions +https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/gtests/test_graph_add.cpp#L64-L66 +or process it manually to extract whatever data in whatever format, like this table +``` + current_op event total_cb total_buffer info +0 ttnn::add begin_op 0 9011200 {'inputs': '8', 'name': 'ttnn::add'} +1 ttnn::repeat begin_op 0 9011200 {'inputs': '2', 'name': 'ttnn::repeat'} +2 ttnn::repeat buffer_allocate 0 17203200 {'address': '753696', 'layout': 'INTERLEAVED', 'size': '8192000', 'type': 'DRAM'} +3 ttnn::repeat buffer_allocate 0 17209344 {'address': '1073735680', 'layout': 'INTERLEAVED', 'size': '6144', 'type': 'DRAM'} +4 ttnn::repeat circular_buffer_allocate 4096 17209344 {'addr': '107360', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'} +5 ttnn::repeat buffer_deallocate 4096 17203200 {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'} +6 ttnn::repeat circular_buffer_deallocate_all 0 17203200 {} +7 ttnn::prim::binary begin_op 0 17203200 {'inputs': '10', 'name': 'ttnn::prim::binary'} +8 ttnn::prim::binary buffer_allocate 0 25395200 {'address': '1437728', 'layout': 'INTERLEAVED', 'size': '8192000', 'type': 'DRAM'} +9 ttnn::prim::binary buffer_allocate 0 25409536 {'address': '1073735680', 'layout': 'INTERLEAVED', 'size': '14336', 'type': 'DRAM'} +10 ttnn::prim::binary circular_buffer_allocate 4096 25409536 {'addr': '107360', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'} +11 ttnn::prim::binary circular_buffer_allocate 8192 25409536 {'addr': '111456', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'} +12 ttnn::prim::binary circular_buffer_allocate 12288 25409536 {'addr': 
'115552', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'} +13 ttnn::prim::binary buffer_deallocate 12288 25395200 {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'} +14 ttnn::prim::binary circular_buffer_deallocate_all 0 25395200 {} +15 ttnn::add buffer_deallocate 0 17203200 {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'} +16 ttnn::add circular_buffer_deallocate_all 0 17203200 {} +``` +or a graph +![trace](https://github.com/user-attachments/assets/42501a1f-8354-4b3b-a5d9-707f30b23f4f) + +## Trace Format +Trace is captured as a JSON and you can find the code producing it [here](https://github.com/tenstorrent/tt-metal/blob/main/ttnn/cpp/ttnn/graph/graph_processor.cpp). +Below you can find a detailed description of the schema. + +### Node Types +The trace is represented as a directed graph, where each node corresponds to a specific operation or memory event. Below is an overview of the various types of nodes that can be present in the trace: + +First, each node has these parameters + +* `counter`: The unique identifier of the node within the graph. +* `node_type`: node type, available types are listed below +* `params`: the map of parameters \[mapping a string property name to a string value\] +* `connections`: An array of connections to subsequent nodes. + +### Node Connections +Each node in the graph maintains a list of connections to other nodes. These connections represent the flow of data and control through the various operations and memory events during the execution of the network. + +### 1\. capture\_start +Marks the beginning of the graph capture process. This node is the root of the graph and does not have any parent nodes. + +#### Parameters +* Empty, as this is a marker node. + +#### Connections +* First element is next operation call +* Last element is the corresponding `capture_end` + +### 2\. capture\_end +Marks the end of the graph capture process. + +#### Parameters +* Empty, as this is a marker node. + +### 3\. 
function\_start +Represents the beginning of a function or operation within the graph. This node captures details about the function name and the number of input parameters it received. + +#### Parameters +* `inputs`: Number of input parameters. +* `name`: The name of the function or operation. + +#### Connections +* Another op call, primitive op call, or a corresponding `function_end` + +#### Function types +* TT-NN operation: for example ttnn::add, ttnn::repeat +* TT-NN primitive operation: for example ttnn::prim::binary (if it uses TMP infra) or ttnn::prim::old\_infra\_device\_operation (if not) +* TT-NN device operation: name “Device Operation” (should change in future) +* Unregistered, but manually tracked functions: (will change) + create\_device\_tensor, Tensor::to, Tensor::cpu, Tensor::cpu\_sharded, Tensor::print, Tensor::pad, Tensor::unpad, Tensor::reshape + tt::tt\_metal::detail::convert\_python\_tensor\_to\_tt\_tensor, tt::tt\_metal::detail::convert\_tt\_tensor\_to\_torch\_tensor + +### 4\. function\_end +Marks the end of a function or operation. This node is paired with the corresponding `function_start` node and records the outputs or final state of the function. + +#### Parameters +* `name`: The name of the function or operation. + +#### Connections +* Output tensor/s +* Next control flow node (potentially `capture_end` node) + +### 5\. buffer +Represents the allocation of a memory buffer. This node records details about the buffer's size, type (e.g., DRAM or L1 cache), and layout. + +#### Parameters +* `size`: The size of the buffer in bytes. +* `type`: The type of memory (e.g., "DRAM", "L1"). +* `layout`: The memory layout (e.g., "INTERLEAVED", "SINGLE\_BANK"). + +#### Connections +* Single element in the connection list specifies the associated Tensor ID + +### 6\. buffer\_allocate +Denotes the allocation of a buffer in memory. This node captures the specifics of the memory allocation event, including the buffer's address and type. 
+ +#### Parameters +* `size`: The size of the buffer in bytes. +* `address`: The memory address of the buffer. +* `type`: The type of memory (e.g., "DRAM", "L1"). +* `layout`: The memory layout. + +#### Connections +* Single element in the connections list specifies the allocated buffer ID + +### 7\. buffer\_deallocate +Represents the deallocation of a buffer from memory. This node records the details of the buffer being deallocated. + +#### Parameters +* `size`: The size of the buffer in bytes. +* `type`: The type of memory (e.g., "DRAM", "L1"). +* `layout`: The memory layout. + +#### Connections +* Single element in the connections list specifies the deallocated buffer ID + +### 8\. circular\_buffer\_allocate +Represents the allocation of a circular buffer, typically used in handling streaming data or multi-buffering strategies. This node captures details like the core range set involved and the buffer size. + +#### Parameters +* `size`: The size of the circular buffer in bytes. +* `address`: The memory address associated with the buffer. +* `core_range_set`: The range of cores involved in the circular buffer. + +#### Connections +Usually empty + +### 9\. circular\_buffer\_deallocate\_all +Marks the deallocation of all circular buffers. This is a bulk operation and is connected to all circular buffer nodes that are being deallocated. + +#### Parameters +Empty, as this operation deallocates all circular buffers. + +#### Connections +Usually empty + +### 10\. tensor +Represents a tensor in the graph. This node captures the tensor's shape and is connected to the memory buffer it uses, if applicable. +`[#]` means that each tensor is indexed, and instead of \# in real trace you will see an id + +#### Parameters +* `tensor_id`: The identifier of the tensor. +* `shape`: The shape of the tensor. 
+ +#### Connections +Usually specifies function\_start of a function where given tensor is passed as an argument/used + +## Operation Dispatching +When run in `NO_DISPATCH` run mode, real allocations do not happen, so trace collection does not have side effects on the allocator state. +You can pass unrealistically big tensors in this mode and unless an operation does upper limit validation, you still can collect the trace. +In this mode trace collection is faster because ops are not dispatched to the device. + +When run in the `NORMAL` mode, memory can be fragmented, which can lead to a different trace and you see real addresses where everything is allocated. + +## Python +Tracing is available through Python too +https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/test_graph_capture.py#L21-L25 + +Here is a sample code to print a table from the beginning of this document + +```py +def process_allocations(graph): + df = pd.DataFrame(columns=['current_op', 'event', 'total_cb', 'total_buffer', 'info']) + + cur_op = [] + total_cb = 0 + total_buffer = 0 + tensors = set() + i = 1 # lets skip initial node + while i < len(graph): + params = '' + v = graph[i] + params = v.params + print(v, len(df)) + i += 1 + if v.node_type == 'function_start': + if len(cur_op) == 0: + #entering first op, lets get all input tensors + while i < len(graph): + print(graph[i], len(df)) + if graph[i].node_type == 'buffer': + total_buffer += int(graph[i].params['size']) + i += 1 + elif graph[i].node_type == 'tensor': + i += 1 + else: + break + name = v.params['name'] + if name == "ttnn::prim::old_infra_device_operation": + name = "ttnn::prim::old_infra_op" + cur_op.append(name) + if v.node_type == 'circular_buffer_allocate': + total_cb += int(v.params['size']) + if v.node_type == 'circular_buffer_deallocate_all': + total_cb = 0 + if v.node_type == 'buffer_allocate': + total_buffer += int(v.params['size']) + if v.node_type == 'function_end': + 
cur_op.pop() + #continue + if v.node_type == 'tensor': + continue + if v.node_type == 'buffer_deallocate': + total_buffer -= int(graph[v.connections[0]].params['size']) + if v.node_type == 'buffer': + continue + if len(cur_op) > 0: + data = {'current_op': cur_op[-1], 'event' : v.node_type, 'total_cb': total_cb, 'total_buffer': total_buffer, 'info' : params} + df.loc[len(df)] = data + return df +``` + +## Sample Trace + +This is a sample trace of running `ttnn::add(Shape\[1, 1, 32, 32\], Shape\[4, 1, 32, 32\])`. +This setup requires broadcasting the first tensor, so the trace contains a call to ttnn::repeat. +The high-level call stack here is: + +``` +ttnn::add +ttnn::repeat +ttnn::prim::old_infra_device_operation (calling ttnn primitive operation) +Device Operation (dispatching device operation) +create_device_tensor (creates intermediate output for ttnn::repeat) +ttnn::prim::binary (calling ttnn primitive operation) +Device Operation (dispatching device operation) +create_device_tensor (creates final output) +``` + +And you can see when each Buffer and CB is allocated / deallocated. 
+ +### PrettyPrint + +``` +Capture Start +Begin: tt::tt_metal::detail::convert_python_tensor_to_tt_tensor +End: tt::tt_metal::detail::convert_python_tensor_to_tt_tensor +Add Tensor: 0 +Begin: ttnn::to_layout + Begin: Tensor::reshape + End: Tensor::reshape + Add Tensor: 1 + Begin: Tensor::pad + End: Tensor::pad + Add Tensor: 2 + Begin: Tensor::to + End: Tensor::to + Add Tensor: 3 +End: ttnn::to_layout +Begin: Tensor::to + Add Device Buffer + Allocate Device Buffer +End: Tensor::to +Add Tensor: 4 +Begin: ttnn::add + Begin: Tensor::to + Add Tensor: -1 + Add Device Buffer + Allocate Device Buffer + End: Tensor::to + Add Tensor: 5 + Begin: ttnn::prim::binary + Begin: BinaryDeviceOperation + Begin: tt::tt_metal::create_device_tensor + Add Device Buffer + Allocate Device Buffer + End: tt::tt_metal::create_device_tensor + Add Tensor: 6 + Add Tensor: 6 + Add Device Buffer + Allocate Device Buffer + Allocate Device Buffer + Allocate Device Buffer + Allocate Device Buffer + Deallocate Device Buffer + End: BinaryDeviceOperation + Add Tensor: 7 + End: ttnn::prim::binary + Deallocate Device Buffer +End: ttnn::add +Begin: Tensor::cpu +End: Tensor::cpu +Add Tensor: 8 +Begin: Tensor::to +End: Tensor::to +Add Tensor: 9 +Begin: tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor +End: tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor +Deallocate Device Buffer +Deallocate Device Buffer +``` + +### Visualizer + +![visualizer](https://github.com/user-attachments/assets/03df00c6-4902-416d-a26a-6ffe874537a5) + + +## Raw JSON +``` +[ + { + "connections": [ + 1, + 32 + ], + "counter": 0, + "node_type": "capture_start", + "params": {} + }, + { + "connections": [ + 3, + 5, + 6, + 18, + 30, + 31 + ], + "counter": 1, + "node_type": "function_start", + "params": { + "inputs": "2", + "name": "ttnn::add" + } + }, + { + "connections": [ + 1, + 18 + ], + "counter": 2, + "node_type": "tensor", + "params": { + "shape": "ttnn.Shape([4, 3, 32, 32])" + } + }, + { + "connections": [ + 2, + 2 + 
], + "counter": 3, + "name": "buffer", + "params": { + "layout": "INTERLEAVED", + "size": "24576", + "type": "L1" + } + }, + { + "connections": [ + 1, + 6 + ], + "counter": 4, + "name": "tensor[1]", + "params": { + "shape": "ttnn.Shape([1, 3, 32, 32])" + } + }, + { + "connections": [ + 4, + 4 + ], + "counter": 5, + "name": "buffer", + "params": { + "layout": "INTERLEAVED", + "size": "6144", + "type": "L1" + } + }, + { + "connections": [ + 7, + 17 + ], + "counter": 6, + "name": "function_start", + "params": { + "inputs": "2", + "name": "ttnn::repeat" + } + }, + { + "connections": [ + 8, + 16 + ], + "counter": 7, + "name": "function_start", + "params": { + "inputs": "5", + "name": "ttnn::prim::old_infra_device_operation" + } + }, + { + "connections": [ + 9, + 14, + 15 + ], + "counter": 8, + "name": "function_start", + "params": { + "inputs": "2", + "name": "Device Operation" + } + }, + { + "connections": [ + 10, + 11, + 12 + ], + "counter": 9, + "name": "function_start", + "params": { + "inputs": "5", + "name": "create_device_tensor" + } + }, + { + "connections": [ + 13, + 13, + 13, + 13, + 13 + ], + "counter": 10, + "name": "buffer", + "params": { + "layout": "INTERLEAVED", + "size": "24576", + "type": "L1" + } + }, + { + "connections": [ + 10 + ], + "counter": 11, + "name": "buffer_allocate", + "params": { + "address": "1953396066", + "layout": "INTERLEAVED", + "size": "24576", + "type": "L1" + } + }, + { + "connections": [ + 13 + ], + "counter": 12, + "name": "function_end", + "params": { + "name": "create_device_tensor" + } + }, + { + "connections": [ + 18 + ], + "counter": 13, + "name": "tensor[2]", + "params": { + "shape": "ttnn.Shape([4, 3, 32, 32])" + } + }, + { + "connections": [], + "counter": 14, + "name": "circular_buffer_allocate", + "params": { + "address": "0", + "core_range_set": "{[(x=0,y=0) - (x=0,y=7)], [(x=1,y=0) - (x=1,y=3)]}", + "size": "4096" + } + }, + { + "connections": [ + 13 + ], + "counter": 15, + "name": "function_end", + "params": { + 
"name": "Device Operation" + } + }, + { + "connections": [ + 13 + ], + "counter": 16, + "name": "function_end", + "params": { + "name": "ttnn::prim::old_infra_device_operation" + } + }, + { + "connections": [ + 13, + 18 + ], + "counter": 17, + "name": "function_end", + "params": { + "name": "ttnn::repeat" + } + }, + { + "connections": [ + 19, + 29 + ], + "counter": 18, + "name": "function_start", + "params": { + "inputs": "10", + "name": "ttnn::prim::binary" + } + }, + { + "connections": [ + 20, + 25, + 26, + 27, + 28 + ], + "counter": 19, + "name": "function_start", + "params": { + "inputs": "2", + "name": "Device Operation" + } + }, + { + "connections": [ + 21, + 22, + 23 + ], + "counter": 20, + "name": "function_start", + "params": { + "inputs": "5", + "name": "create_device_tensor" + } + }, + { + "connections": [ + 24, + 24, + 24, + 24 + ], + "counter": 21, + "name": "buffer", + "params": { + "layout": "INTERLEAVED", + "size": "24576", + "type": "L1" + } + }, + { + "connections": [ + 21 + ], + "counter": 22, + "name": "buffer_allocate", + "params": { + "address": "0", + "layout": "INTERLEAVED", + "size": "24576", + "type": "L1" + } + }, + { + "connections": [ + 24 + ], + "counter": 23, + "name": "function_end", + "params": { + "name": "create_device_tensor" + } + }, + { + "connections": [], + "counter": 24, + "name": "tensor[3]", + "params": { + "shape": "ttnn.Shape([4, 3, 32, 32])" + } + }, + { + "connections": [], + "counter": 25, + "name": "circular_buffer_allocate", + "params": { + "address": "0", + "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}", + "size": "4096" + } + }, + { + "connections": [], + "counter": 26, + "name": "circular_buffer_allocate", + "params": { + "address": "0", + "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}", + "size": "4096" + } + }, + { + "connections": [], + "counter": 27, + "name": "circular_buffer_allocate", + "params": { + "address": "0", + "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}", + "size": "4096" + } + }, + { + 
"connections": [ + 24 + ], + "counter": 28, + "name": "function_end", + "params": { + "name": "Device Operation" + } + }, + { + "connections": [ + 24 + ], + "counter": 29, + "name": "function_end", + "params": { + "name": "ttnn::prim::binary" + } + }, + { + "connections": [ + 10 + ], + "counter": 30, + "name": "buffer_deallocate", + "params": { + "layout": "INTERLEAVED", + "size": "0", + "type": "L1" + } + }, + { + "connections": [ + 24, + 33 + ], + "counter": 31, + "name": "function_end", + "params": { + "name": "ttnn::add" + } + }, + { + "connections": [ + 21 + ], + "counter": 32, + "name": "buffer_deallocate", + "params": { + "layout": "INTERLEAVED", + "size": "0", + "type": "L1" + } + }, + { + "connections": [], + "counter": 33, + "name": "capture_end", + "params": {} + } +] +``` diff --git a/tests/nightly/tg/ccl/test_all_gather_nightly.py b/tests/nightly/tg/ccl/test_all_gather_nightly.py new file mode 120000 index 00000000000..92d5007ada5 --- /dev/null +++ b/tests/nightly/tg/ccl/test_all_gather_nightly.py @@ -0,0 +1 @@ +../../../../tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_nightly.py \ No newline at end of file diff --git a/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py b/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py new file mode 120000 index 00000000000..ac93b90f333 --- /dev/null +++ b/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py @@ -0,0 +1 @@ +../../../../tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py \ No newline at end of file diff --git a/tests/scripts/run_python_model_tests.sh b/tests/scripts/run_python_model_tests.sh index 9ec73c2e37b..0643861ec26 100755 --- a/tests/scripts/run_python_model_tests.sh +++ b/tests/scripts/run_python_model_tests.sh @@ -35,6 +35,7 @@ run_python_model_tests_wormhole_b0() { # higher sequence lengths and different formats trigger memory issues pytest models/demos/falcon7b_common/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py -k "seq_len_128 and 
in0_BFLOAT16-in1_BFLOAT8_B-out_BFLOAT16-weights_DRAM" pytest tests/ttnn/integration_tests/resnet/test_ttnn_functional_resnet50_new.py -k "pretrained_weight_false" + pytest models/experimental/yolov4/demo/demo.py -k "pretrained_weight_false" # Unet Shallow WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -svv models/experimental/functional_unet/tests/test_unet_model.py diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index fd8d580296c..517503b2646 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -79,12 +79,6 @@ run_frequent_api_pipeline_tests() { TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_frequent echo "Running Python API unit tests in SD for frequent..." ./tests/scripts/run_python_api_unit_tests.sh - else - if [[ $tt_arch == "wormhole_b0" ]]; then - pytest -n auto tests/ttnn/unit_tests/operations/test_all_gather.py -k nightly - else - echo "API tests are not available for fast dispatch because they're already covered in post-commit" - fi fi } diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh index 1bcf876a66e..89e5c253c7c 100755 --- a/tests/scripts/tg/run_tg_nightly_tests.sh +++ b/tests/scripts/tg/run_tg_nightly_tests.sh @@ -7,7 +7,8 @@ run_tg_llama3_70b_tests() { echo "LOG_METAL: Running run_tg_llama3_70b_tests" - pytest tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_nightly.py ; fail+=$? + pytest tests/nightly/tg/ccl/test_all_gather_nightly.py ; fail+=$? + pytest tests/nightly/tg/ccl/test_reduce_scatter_nightly.py ; fail+=$? # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$? 
diff --git a/tests/sweep_framework/sweep_utils/utils.py b/tests/sweep_framework/sweep_utils/utils.py index 89ff4c15aed..9f574d47f88 100644 --- a/tests/sweep_framework/sweep_utils/utils.py +++ b/tests/sweep_framework/sweep_utils/utils.py @@ -129,6 +129,8 @@ def gen_rand_bitwise_left_shift(size, shift_bits=None, low=-2147483647, high=214 def gen_with_zeroes(size, probabilityzeroes=0.5, low=-100, high=100, dtype=torch.bfloat16): element_count = 1 + if probabilityzeroes == "random": + probabilityzeroes = random.uniform(0.0, 0.9) for i in size: element_count = element_count * i raw = torch.zeros(element_count).to(dtype) diff --git a/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py b/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py new file mode 100644 index 00000000000..ba72a97d98a --- /dev/null +++ b/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm, gen_with_zeroes +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. 
+# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 1, 1], [1, 1, 1, 256], [1, 1, 1, 1], 16), + "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG], + }, + "xfail": { + "input_shape": gen_shapes([1, 1, 1, 1], [1, 1, 1, 256], [1, 1, 1, 1], 16), + "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: + if test_vector["input_layout"] == ttnn.TILE_LAYOUT: + return True, "Input tensor must be in row major layout" + if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b: + return True, "bfloat8_b is only supported on tiled layout" + return False, None + + +def run( + input_shape, + input_a_dtype, + input_layout, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + if input_layout == ttnn.ROW_MAJOR_LAYOUT: + input_shape = sanitize_shape_rm(input_shape) + + torch_input_tensor_a = gen_func_with_cast_tt( + partial(gen_with_zeroes, probabilityzeroes="random", low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + + torch_output_tensor = torch.nonzero(torch_input_tensor_a, as_tuple=False) + torch_num_nonzero = torch_output_tensor.shape[0] + torch_output_tensor = torch_output_tensor[:, 3].reshape(-1, 1) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=input_a_dtype, + layout=input_layout, + device=device, + 
memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_indices, output_tensor = ttnn.nonzero(input_tensor_a, memory_config=output_memory_config) + e2e_perf = stop_measuring_time(start_time) + + num_nonzero = ttnn.to_torch(output_indices)[0, 0, 0, 0].item() + output_tensor = ttnn.to_torch(output_tensor)[0, 0, 0, :num_nonzero].reshape(-1, 1) + + if num_nonzero != torch_num_nonzero: + return [ + (False, f"Expected num of non-zero: {torch_num_nonzero}, actual num of non_zero: {num_nonzero}"), + e2e_perf, + ] + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py new file mode 100644 index 00000000000..c16b5fe8722 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. 
+# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16) + + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16) + + gen_shapes([1, 1], [256, 256], [1, 1], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "input_b_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "input_b_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +def str_to_float(x): + try: + return float(x) + except: + return 0.0 + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. 
+def run( + input_shape, + grad_dtype, + input_a_dtype, + input_b_dtype, + grad_layout, + input_a_layout, + input_b_layout, + grad_memory_config, + input_a_memory_config, + input_b_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt(partial(torch_random, low=-10, high=10, dtype=torch.float32), grad_dtype)( + input_shape + ) + + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + torch_input_tensor_b = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype + )(input_shape) + torch_input_tensor_b.requires_grad = True + torch_input_tensor_b.retain_grad() + + golden_function = ttnn.get_golden_function(ttnn.rsub_bw) + torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a, torch_input_tensor_b) + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b.detach().clone(), + dtype=input_b_dtype, + layout=input_b_layout, + device=device, + memory_config=input_b_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.rsub_bw(grad_tensor, input_tensor_a, input_tensor_b, memory_config=output_memory_config) + + for i in range(len(output_tensor)): + output_tensor[i] = ttnn.to_torch(output_tensor[i]) + e2e_perf = stop_measuring_time(start_time) + + pcc = [True, 1.0] + + for i in range(len(output_tensor)): + pcc_tmp = 
check_with_pcc(torch_output_tensor[i], output_tensor[i], 0.99) + pcc[0] = pcc[0] and pcc_tmp[0] + pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1])) + + pcc[1] = str(pcc[1]) + # print(f"pcc {pcc}") + return [pcc, e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py new file mode 100644 index 00000000000..61bc0e090ba --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. 
+parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16) + + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16) + + gen_shapes([1, 1], [256, 256], [1, 1], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "input_b_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "input_b_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +def str_to_float(x): + try: + return float(x) + except: + return 0.0 + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. 
+def run( + input_shape, + grad_dtype, + input_a_dtype, + input_b_dtype, + grad_layout, + input_a_layout, + input_b_layout, + grad_memory_config, + input_a_memory_config, + input_b_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt(partial(torch_random, low=-10, high=10, dtype=torch.float32), grad_dtype)( + input_shape + ) + + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + torch_input_tensor_b = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype + )(input_shape) + torch_input_tensor_b.requires_grad = True + torch_input_tensor_b.retain_grad() + + golden_function = ttnn.get_golden_function(ttnn.squared_difference_bw) + torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a, torch_input_tensor_b) + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b.detach().clone(), + dtype=input_b_dtype, + layout=input_b_layout, + device=device, + memory_config=input_b_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.squared_difference_bw( + grad_tensor, input_tensor_a, input_tensor_b, memory_config=output_memory_config + ) + + for i in range(len(output_tensor)): + output_tensor[i] = ttnn.to_torch(output_tensor[i]) + e2e_perf = stop_measuring_time(start_time) + + pcc = [True, 1.0] + + for i in range(len(output_tensor)): + 
pcc_tmp = check_with_pcc(torch_output_tensor[i], output_tensor[i], 0.99) + pcc[0] = pcc[0] and pcc_tmp[0] + pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1])) + + pcc[1] = str(pcc[1]) + # print(f"pcc {pcc}") + return [pcc, e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py new file mode 100644 index 00000000000..90b2c6ebe1c --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. 
+parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. 
+def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + alpha = torch.tensor(1, dtype=torch.bfloat16).uniform_(0.01, 10).item() + + intermediate_result = torch.nn.functional.elu(torch_input_tensor_a, alpha=alpha) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.elu_bw(grad_tensor, input_tensor_a, alpha=alpha, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py new file mode 100644 index 00000000000..cb4ab1809d6 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. 
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + alpha = torch.tensor(1, dtype=torch.bfloat16).uniform_(-10, 10).item() + + intermediate_result = torch.nn.functional.elu(torch_input_tensor_a, alpha=alpha) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.elu_bw(grad_tensor, input_tensor_a, alpha=alpha, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py new file mode 100644 index 00000000000..2d3aa811334 --- /dev/null +++ 
b/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. 
+# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + intermediate_result = torch.floor(torch_input_tensor_a) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.floor_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py new file mode 100644 index 00000000000..1c9cbf0e84a --- 
/dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. 
+# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + intermediate_result = torch.nn.functional.hardswish(torch_input_tensor_a) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.hardswish_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py new file mode 100644 index 
00000000000..31c1262b411 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. 
+# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + negative_slope = torch.tensor(1, dtype=torch.bfloat16).uniform_(0, 100).item() + + intermediate_result = torch.nn.functional.leaky_relu(torch_input_tensor_a, negative_slope=negative_slope) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.leaky_relu_bw( + grad_tensor, input_tensor_a, negative_slope=negative_slope, memory_config=output_memory_config + )[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git 
a/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py new file mode 100644 index 00000000000..dd5d6c19ea4 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "xfail") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = { + "xfail": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. 
+def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + factor = random.uniform(0.1, 10.0) + + intermediate_result = torch.pow(factor, torch_input_tensor_a)  # rpow reference: factor ** input (scalar base, tensor exponent) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.rpow_bw(grad_tensor, input_tensor_a, factor, memory_config=output_memory_config)[0]  # op under test is rpow_bw, not pow_bw + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py new file mode 100644 index 00000000000..d4eeac59b7e --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. 
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + intermediate_result = torch.nn.functional.selu(torch_input_tensor_a) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.selu_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py new file mode 100644 index 00000000000..fed2d0d1d08 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: © 2024 
Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. 
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + intermediate_result = torch.nn.functional.silu(torch_input_tensor_a) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.silu_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py new file mode 100644 index 00000000000..a2afd5c6e4a --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py @@ -0,0 +1,95 @@ 
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. +parameters = { + "nightly": { + "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) + + gen_shapes([32, 32], [256, 256], [32, 32], 16), + "grad_dtype": [ttnn.bfloat16], + "input_a_dtype": [ttnn.bfloat16], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. 
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. +def run( + input_shape, + grad_dtype, + input_a_dtype, + grad_layout, + input_a_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape) + torch_input_tensor_a = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + )(input_shape) + torch_input_tensor_a.requires_grad = True + torch_input_tensor_a.retain_grad() + + intermediate_result = torch.nn.functional.tanhshrink(torch_input_tensor_a) + intermediate_result.backward(gradient=torch_grad_tensor) + torch_output_tensor = torch_input_tensor_a.grad + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=grad_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a.detach().clone(), + dtype=input_a_dtype, + layout=input_a_layout, + device=device, + memory_config=input_a_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.tanhshrink_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + output_tensor = ttnn.to_torch(output_tensor) + e2e_perf = stop_measuring_time(start_time) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/embedding/embedding.py b/tests/sweep_framework/sweeps/embedding/embedding.py new file mode 100644 index 00000000000..739f74e854a --- /dev/null +++ b/tests/sweep_framework/sweeps/embedding/embedding.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = { + "nightly": { + "embedding_args": gen_shapes([1, 32, 32, 128], [4, 2080, 4128, 550], [1, 32, 32, 32], 32), + "input_dtype": [ttnn.uint32], + "weight_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "output_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "weight_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "weight_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: + if test_vector["input_layout"] == ttnn.TILE_LAYOUT: + return True, "Input must be in row major layout" + if test_vector["weight_layout"] == ttnn.TILE_LAYOUT: + return True, "Weights must be in row major layout" + if test_vector["output_dtype"] == ttnn.bfloat8_b: + return True, "bfloat8_b is not supported for output tensor" + if test_vector["weight_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["weight_dtype"] == ttnn.bfloat8_b: + return True, "bfloat8_b is only supported on tiled layout" + return False, None + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run( + embedding_args, + input_dtype, + weight_dtype, + output_dtype, + input_layout, + weight_layout, + input_memory_config, + weight_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + batch_size, seq_length, embeddings_dim, num_embeddings = embedding_args + + input_shape = (batch_size, seq_length) + weight_shape = (num_embeddings, embeddings_dim) + + torch_input_tensor = torch_random(input_shape, 0, num_embeddings, torch.int64) + torch_weight_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), weight_dtype + )(weight_shape) + + golden_function = ttnn.get_golden_function(ttnn.embedding) + torch_output_tensor = golden_function(torch_input_tensor, torch_weight_tensor).squeeze(dim=0) + # torch_output_tensor = torch.nn.functional.embedding(torch_input_tensor, torch_weight_tensor) + + input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=input_dtype, + layout=input_layout, + device=device, + memory_config=input_memory_config, + ) + weight_tensor = ttnn.from_torch( + torch_weight_tensor, + dtype=weight_dtype, + layout=weight_layout, + device=device, + memory_config=weight_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.embedding(input_tensor, weight_tensor, dtype=output_dtype, memory_config=output_memory_config) + e2e_perf = stop_measuring_time(start_time) + + output_tensor = ttnn.to_torch(output_tensor).squeeze() + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/tests_common/sfpu_helper/sfpu_helper.hpp b/tests/tests_common/sfpu_helper/sfpu_helper.hpp index f643d93e478..05ea74d1b5b 100644 --- a/tests/tests_common/sfpu_helper/sfpu_helper.hpp +++ b/tests/tests_common/sfpu_helper/sfpu_helper.hpp @@ -74,8 +74,8 @@ float ref_identity(float x) { return x; } -vector sfpu(const std::vector &src, std::function sfpu_func) { - vector dst; 
+std::vector sfpu(const std::vector &src, std::function sfpu_func) { + std::vector dst; for (uint32_t el: src) { @@ -98,13 +98,13 @@ vector sfpu(const std::vector &src, std::function create_random_ones_and_twos_vector_of_bfloat16(uint32_t num_bytes, int seed) { +std::vector create_random_ones_and_twos_vector_of_bfloat16(uint32_t num_bytes, int seed) { // Used for reciprocal, since binary vectors are filled with 0s and 1s, and recip of 0 is undefined, // so then we just generate a vector of ones and twos - vector src = create_random_binary_vector_of_bfloat16(num_bytes, seed); + std::vector src = create_random_binary_vector_of_bfloat16(num_bytes, seed); - vector dst; + std::vector dst; for (uint32_t el: src) { @@ -148,7 +148,7 @@ bool is_close_rtol_0p175_atol_0p1(float a, float b) { } // SFPU maps -> relevant kernels, golden functions, comparison functions -static std::vector sfpu_op = +static std::vector sfpu_op = { "relu", "exponential", "reciprocal", @@ -165,7 +165,7 @@ static std::vector sfpu_op = "identity" }; -const map> sfpu_op_to_function = { +const std::map> sfpu_op_to_function = { {"relu", relu}, {"exponential", exponential}, {"reciprocal", reciprocal}, @@ -182,7 +182,7 @@ const map> sfpu_op_to_function = { {"identity", ref_identity} }; -const map(uint32_t num_bytes, int seed)>> sfpu_op_to_init_func = { +const std::map(uint32_t num_bytes, int seed)>> sfpu_op_to_init_func = { {"relu", create_random_vector_of_bfloat16_1_1}, {"exponential", create_random_binary_vector_of_bfloat16}, {"reciprocal", create_random_ones_and_twos_vector_of_bfloat16}, @@ -199,7 +199,7 @@ const map(uint32_t num_bytes, int seed)>> {"identity", create_random_vector_of_bfloat16_1_1} }; -const map> sfpu_op_to_comparison_function = { +const std::map> sfpu_op_to_comparison_function = { {"exponential", equal_within_two_sig_figs}, {"reciprocal", equal_within_absolute_tolerance_of_0p03}, {"gelu", is_close_0p015}, diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp 
index 40543974d05..2773327468e 100644 --- a/tests/tt_eager/ops/test_sfpu.cpp +++ b/tests/tt_eager/ops/test_sfpu.cpp @@ -18,6 +18,7 @@ #include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp" // #include "tt_gdb/tt_gdb.hpp" +using std::vector; // SFPU maps -> relevant kernels, golden functions, comparison functions std::map> sfpu_op_to_hlk_op_name={}; diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp index 31db47a4b35..c5583841c7f 100644 --- a/tests/tt_eager/ops/test_sliding_window_ops.cpp +++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp @@ -14,6 +14,7 @@ #include "ttnn/operations/numpy/functions.hpp" #include "ttnn/tensor/types.hpp" +using std::vector; using tt::tt_metal::LegacyShape; using tt::tt_metal::Tensor; using namespace ttnn::operations::sliding_window; diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py index 117b1383491..296734f0f4e 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py @@ -27,7 +27,7 @@ @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b)) def test_run_bcast_h_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) ] * 2 comparison_func = partial(comparison_funcs.comp_pcc) run_single_pytorch_test( @@ -60,7 +60,7 @@ def test_run_bcast_h_test(input_shapes, bcast_op_type, dtype, device, function_l @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b)) def test_run_bcast_w_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults): 
datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) ] * 2 comparison_func = partial(comparison_funcs.comp_pcc) run_single_pytorch_test( @@ -93,7 +93,7 @@ def test_run_bcast_w_test(input_shapes, bcast_op_type, dtype, device, function_l @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b)) def test_run_bcast_hw_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) ] * 2 comparison_func = partial(comparison_funcs.comp_pcc) run_single_pytorch_test( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py index 60acfe5f59b..0fc89897817 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py @@ -16,7 +16,7 @@ from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( run_single_pytorch_test, ) -from models.utility_functions import is_grayskull, skip_for_blackhole +from models.utility_functions import is_grayskull mem_configs = [ ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM), @@ -24,7 +24,6 @@ ] -@skip_for_blackhole("Only supported for WH, see #12349") @pytest.mark.parametrize("accurate_mode", [False, True]) @pytest.mark.parametrize("round_mode", ["None", "trunc", "floor"]) @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py 
b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py index 0a85d67f471..3d85a290137 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py @@ -16,7 +16,7 @@ from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( run_single_pytorch_test, ) -from models.utility_functions import is_grayskull, skip_for_blackhole +from models.utility_functions import is_grayskull mem_configs = [ ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM), @@ -24,7 +24,6 @@ ] -@skip_for_blackhole("Only supported on WH, see #12349") @pytest.mark.parametrize("accurate_mode", [True]) @pytest.mark.parametrize("round_mode", ["None", "trunc", "floor"]) @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py index 28cd0590730..65b45a5ba6b 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py @@ -16,7 +16,7 @@ from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( run_single_pytorch_test, ) -from models.utility_functions import skip_for_grayskull, skip_for_blackhole +from models.utility_functions import skip_for_grayskull mem_configs = [ ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM), @@ -24,7 +24,6 @@ ] -@skip_for_blackhole("Only supported on WH, see #12349") @pytest.mark.parametrize( "input_shapes", [ diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py index 25e92cbf3fd..9c6907d1fac 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py +++ 
b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py @@ -7,7 +7,7 @@ import random from functools import partial import ttnn -from models.utility_functions import skip_for_grayskull, skip_for_blackhole +from models.utility_functions import skip_for_grayskull from tests.tt_eager.python_api_testing.sweep_tests import ( comparison_funcs, @@ -23,7 +23,6 @@ ] -@skip_for_blackhole("Unsupported on BH, see #12349") @pytest.mark.parametrize( "input_shapes", [ diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py index 0783e8c464e..dcaac29bc89 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py @@ -469,6 +469,7 @@ def run_test_sdpa_decode_single_iter( assert out_pass +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.parametrize( "dtype, q_dtype", @@ -518,6 +519,7 @@ def test_sdpa_decode( ) +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.parametrize( "dtype, q_dtype", @@ -550,6 +552,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt assert device.num_program_cache_entries() == 1 +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.parametrize( "dtype, q_dtype", @@ -869,6 +872,7 @@ def test_sdpa_decode_paged_attention( assert device.num_program_cache_entries() == 4 +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.parametrize( "dtype, q_dtype", 
@@ -898,6 +902,7 @@ def test_sdpa_decode_sharded(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype ) +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.skip("Skipping Perf Test in CI") def test_sdpa_decode_perf(device, use_program_cache): @@ -952,6 +957,7 @@ def test_sdpa_decode_perf(device, use_program_cache): ) +@skip_for_blackhole("Unsupported on BH, see #12349") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") @pytest.mark.parametrize( "dtype", @@ -1171,6 +1177,7 @@ def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dty logger.info(f"PCC failed Start Pos: {failed_start_pos}") +@skip_for_blackhole("Unsupported on BH, see #12349") @pytest.mark.timeout(600) @pytest.mark.skip("Skipping due to causing 45 minutes timeout on tt eager unit tests") @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py index 5bafca9f180..1b204e133fb 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py @@ -9,7 +9,6 @@ from models.utility_functions import skip_for_blackhole -@skip_for_blackhole("Mismatching on BH, see #12349") @pytest.mark.parametrize( "shape_dim", ( diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py index 7ee7809fbb8..29ac447236e 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py @@ -631,6 +631,7 @@ def test_transpose_bfloat8_b(device, shape, swap_dims): assert_with_pcc(torch_output, tt_output, 0.9999) +@skip_for_blackhole("Mismatching on BH, see #12349") 
@pytest.mark.parametrize( "dtype", (ttnn.bfloat16, ttnn.float32), @@ -649,6 +650,7 @@ def test_transpose_hc(dtype, shape, device): transpose(shape, device, dim0=1, dim1=-2, input_dtype=dtype) +@skip_for_blackhole("Mismatching on BH, see #12349") @pytest.mark.parametrize( "dtype", (ttnn.bfloat16, ttnn.float32), @@ -677,6 +679,7 @@ def test_transpose_2D(dtype, shape, layout, device): assert_with_pcc(torch_output, tt_output, 0.9999) +@skip_for_blackhole("Mismatching on BH, see #12349") @pytest.mark.parametrize( "dtype", (ttnn.bfloat16, ttnn.float32), @@ -758,6 +761,7 @@ def test_transpose_failures(config, device): assert_with_pcc(torch_output, tt_output, 0.9999) +@skip_for_blackhole("Mismatching on BH, see #12349") @pytest.mark.parametrize( "config", [ diff --git a/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py index 71641bc8071..9bf38150854 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py +++ b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py @@ -28,7 +28,6 @@ def get_tensors(input_shape, output_shape, device): return tt_input, tt_output, torch_input -@skip_for_blackhole("Mismatching on BH, see #12349") @pytest.mark.parametrize( "input_shape", ( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index 5bbf0ca25b0..71b845e629d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -24,6 +24,7 @@ #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" #include +using std::vector; using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index 71447cb3e19..0596b2939ba 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -28,6 +28,7 @@ #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" #include "tt_metal/common/work_split.hpp" +using std::vector; using namespace tt; //////////////////////////////////////////////////////////////////////////////// // This benchmark measures the compute performance of matmul. When in the slow diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp index 9bb6a19092a..804c62594d2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp @@ -13,6 +13,7 @@ #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp index 879d11a1b4c..af91f5e785a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp @@ -28,7 +28,7 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::Device *device, con hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::DPRINT); // This works for tensix only, will need to be updated for eth - vector print_buffer_addrs = { + std::vector print_buffer_addrs = { 
reinterpret_cast(dprint_msg->data[DPRINT_RISCV_INDEX_NC]), reinterpret_cast(dprint_msg->data[DPRINT_RISCV_INDEX_BR]), reinterpret_cast(dprint_msg->data[DPRINT_RISCV_INDEX_TR0]), @@ -37,7 +37,7 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::Device *device, con }; for (const auto &worker_core : worker_cores_used_in_program) { for (const auto &buffer_addr : print_buffer_addrs) { - vector profile_buffer; + std::vector profile_buffer; uint32_t end_index; uint32_t dropped_marker_counter; profile_buffer = tt::llrt::read_hex_vec_from_core(device_id, worker_core, buffer_addr, DPRINT_BUFFER_SIZE); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 0959bd24c98..fada32bb47c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -28,8 +28,8 @@ struct one_core_data_t { CoreCoord phys_core; int bank_id; int bank_offset; - vector valid; - vector data; + std::vector valid; + std::vector data; }; class DeviceData { @@ -342,7 +342,7 @@ inline bool DeviceData::validate_one_core(Device *device, uint32_t result_addr) { int fail_count = 0; const std::vector& dev_data = one_core_data.data; - const vector& dev_valid = one_core_data.valid; + const std::vector& dev_valid = one_core_data.valid; const CoreCoord logical_core = one_core_data.logical_core; const CoreCoord phys_core = one_core_data.phys_core; const CoreType core_type = one_core_data.core_type; @@ -366,7 +366,7 @@ inline bool DeviceData::validate_one_core(Device *device, // Read results from device and compare to expected for this core. 
result_addr += bank_offset; - vector results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); + std::vector results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); log_info(tt::LogTest, "Validating {} bytes from {} bank {} log_core {}: phys_core: {} at addr: 0x{:x}", size_bytes, core_string, bank_id, logical_core.str(), phys_core.str(), result_addr); @@ -536,7 +536,7 @@ inline uint32_t get_min_required_buffer_addr(Device *device, bool is_dram){ return min_required_positive_offset; } -inline void generate_random_payload(vector& cmds, +inline void generate_random_payload(std::vector& cmds, uint32_t length) { for (uint32_t i = 0; i < length; i++) { @@ -545,7 +545,7 @@ inline void generate_random_payload(vector& cmds, } } -inline void generate_random_payload(vector& cmds, +inline void generate_random_payload(std::vector& cmds, const CoreRange& workers, DeviceData& data, uint32_t length_words, @@ -579,7 +579,7 @@ inline void generate_random_payload(vector& cmds, // Generate a random payload for a paged write command. Note: Doesn't currently support using the base_addr here. inline void generate_random_paged_payload(Device *device, CQDispatchCmd cmd, - vector& cmds, + std::vector& cmds, DeviceData& data, uint32_t start_page, bool is_dram) { @@ -618,8 +618,8 @@ inline void generate_random_paged_payload(Device *device, } } -inline void generate_random_packed_payload(vector& cmds, - vector& worker_cores, +inline void generate_random_packed_payload(std::vector& cmds, + std::vector& worker_cores, DeviceData& data, uint32_t size_words, bool repeat = false) { @@ -628,7 +628,7 @@ inline void generate_random_packed_payload(vector& cmds, const uint32_t bank_id = 0; // No interleaved pages here. bool first_core = true; - vectorresults; + std::vectorresults; CoreCoord first_worker = worker_cores[0]; for (uint32_t i = 0; i < size_words; i++) { uint32_t datum = (use_coherent_data_g) ? 
((first_worker.x << 16) | (first_worker.y << 24) | coherent_count++) : std::rand(); @@ -648,7 +648,7 @@ inline void generate_random_packed_payload(vector& cmds, } } -inline void generate_random_packed_large_payload(vector& generated_data, +inline void generate_random_packed_large_payload(std::vector& generated_data, CoreRange range, DeviceData& data, uint32_t size_words) { @@ -676,7 +676,7 @@ inline void generate_random_packed_large_payload(vector& generated_dat } } -inline void add_bare_dispatcher_cmd(vector& cmds, +inline void add_bare_dispatcher_cmd(std::vector& cmds, CQDispatchCmd cmd) { static_assert(sizeof(CQDispatchCmd) % sizeof(uint32_t) == 0, "CQDispatchCmd size must be a multiple of uint32_t size"); const size_t num_uint32s = sizeof(CQDispatchCmd) / sizeof(uint32_t); @@ -688,7 +688,7 @@ inline void add_bare_dispatcher_cmd(vector& cmds, } } -inline size_t debug_prologue(vector& cmds) { +inline size_t debug_prologue(std::vector& cmds) { size_t prior = cmds.size(); if (debug_g) { @@ -707,7 +707,7 @@ inline size_t debug_prologue(vector& cmds) { return prior; } -inline void debug_epilogue(vector& cmds, +inline void debug_epilogue(std::vector& cmds, size_t prior_end) { if (debug_g) { // Doing a checksum on the full command length is problematic in the kernel @@ -731,7 +731,7 @@ inline void debug_epilogue(vector& cmds, } } -inline void add_dispatcher_cmd(vector& cmds, +inline void add_dispatcher_cmd(std::vector& cmds, CQDispatchCmd cmd, uint32_t length) { @@ -744,7 +744,7 @@ inline void add_dispatcher_cmd(vector& cmds, debug_epilogue(cmds, prior_end); } -inline void add_dispatcher_cmd(vector& cmds, +inline void add_dispatcher_cmd(std::vector& cmds, const CoreRange& workers, DeviceData& device_data, CQDispatchCmd cmd, @@ -762,7 +762,7 @@ inline void add_dispatcher_cmd(vector& cmds, } inline void add_dispatcher_paged_cmd(Device *device, - vector& cmds, + std::vector& cmds, DeviceData& device_data, CQDispatchCmd cmd, uint32_t start_page, @@ -775,8 +775,8 @@ inline 
void add_dispatcher_paged_cmd(Device *device, } inline void add_dispatcher_packed_cmd(Device *device, - vector& cmds, - vector& worker_cores, + std::vector& cmds, + std::vector& worker_cores, DeviceData& device_data, CQDispatchCmd cmd, uint32_t size_words, @@ -798,7 +798,7 @@ inline void add_dispatcher_packed_cmd(Device *device, // bare: doesn't generate random payload data, for use w/ eg, dram reads inline void gen_bare_dispatcher_unicast_write_cmd(Device *device, - vector& cmds, + std::vector& cmds, CoreCoord worker_core, DeviceData& device_data, uint32_t length) { @@ -821,7 +821,7 @@ inline void gen_bare_dispatcher_unicast_write_cmd(Device *device, } inline void gen_dispatcher_unicast_write_cmd(Device *device, - vector& cmds, + std::vector& cmds, CoreCoord worker_core, DeviceData& device_data, uint32_t length) { @@ -842,7 +842,7 @@ inline void gen_dispatcher_unicast_write_cmd(Device *device, } inline void gen_dispatcher_multicast_write_cmd(Device *device, - vector& cmds, + std::vector& cmds, CoreRange worker_core_range, DeviceData& device_data, uint32_t length) { @@ -868,7 +868,7 @@ inline void gen_dispatcher_multicast_write_cmd(Device *device, } inline void gen_dispatcher_paged_write_cmd(Device *device, - vector& cmds, + std::vector& cmds, DeviceData& device_data, bool is_dram, uint32_t start_page, @@ -913,8 +913,8 @@ inline void gen_dispatcher_paged_write_cmd(Device *device, inline void gen_dispatcher_packed_write_cmd(Device *device, - vector& cmds, - vector& worker_cores, + std::vector& cmds, + std::vector& worker_cores, DeviceData& device_data, uint32_t size_words, bool repeat = false) { @@ -938,7 +938,7 @@ inline void gen_dispatcher_packed_write_cmd(Device *device, } inline void gen_rnd_dispatcher_packed_write_cmd(Device *device, - vector& cmds, + std::vector& cmds, DeviceData& device_data) { // Note: this cmd doesn't clamp to a max size which means it can overflow L1 buffer @@ -952,7 +952,7 @@ inline void gen_rnd_dispatcher_packed_write_cmd(Device *device, 
if (xfer_size_bytes < min_xfer_size_bytes_g) xfer_size_bytes = min_xfer_size_bytes_g; } - vector gets_data; + std::vector gets_data; while (gets_data.size() == 0) { for (auto & [core, one_worker] : device_data.get_data()) { if (device_data.core_and_bank_present(core, 0) && @@ -984,14 +984,14 @@ inline void gen_rnd_dispatcher_packed_write_cmd(Device *device, inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device, CoreRange workers, - vector& cmds, + std::vector& cmds, DeviceData& device_data, uint32_t space_available) { int ntransactions = perf_test_g ? (CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS / 2) : ((std:: rand() % CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) + 1); - vector sizes; + std::vector sizes; for (int i = 0; i < ntransactions; i++) { constexpr uint32_t max_pages = 4; uint32_t xfer_size_16b = (std::rand() % (dispatch_buffer_page_size_g * max_pages / hal.get_alignment(HalMemType::L1))) + 1; @@ -1022,7 +1022,7 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device, cmd.write_packed_large.alignment = hal.get_alignment(HalMemType::L1); add_bare_dispatcher_cmd(cmds, cmd); - vector data; + std::vector data; for (int i = 0; i < ntransactions; i++) { uint32_t xfer_size_bytes = sizes[i]; @@ -1061,7 +1061,7 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device, return false; } -inline void gen_dispatcher_host_write_cmd(vector& cmds, +inline void gen_dispatcher_host_write_cmd(std::vector& cmds, DeviceData& device_data, uint32_t length) { @@ -1075,7 +1075,7 @@ inline void gen_dispatcher_host_write_cmd(vector& cmds, add_dispatcher_cmd(cmds, device_data.get_host_core(), device_data, cmd, length, false, true); } -inline void gen_bare_dispatcher_host_write_cmd(vector& cmds, uint32_t length) { +inline void gen_bare_dispatcher_host_write_cmd(std::vector& cmds, uint32_t length) { CQDispatchCmd cmd; memset(&cmd, 0, sizeof(CQDispatchCmd)); @@ -1087,7 +1087,7 @@ inline void gen_bare_dispatcher_host_write_cmd(vector& 
cmds, uint32_t add_bare_dispatcher_cmd(cmds, cmd); } -inline void gen_dispatcher_set_write_offset_cmd(vector& cmds, uint32_t wo0, uint32_t wo1 = 0, uint32_t wo2 = 0) { +inline void gen_dispatcher_set_write_offset_cmd(std::vector& cmds, uint32_t wo0, uint32_t wo1 = 0, uint32_t wo2 = 0) { CQDispatchCmd cmd; memset(&cmd, 0, sizeof(CQDispatchCmd)); @@ -1100,7 +1100,7 @@ inline void gen_dispatcher_set_write_offset_cmd(vector& cmds, uint32_t add_dispatcher_cmd(cmds, cmd, payload_length); } -inline void gen_dispatcher_terminate_cmd(vector& cmds) { +inline void gen_dispatcher_terminate_cmd(std::vector& cmds) { CQDispatchCmd cmd; memset(&cmd, 0, sizeof(CQDispatchCmd)); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index b660e49d921..d13994ded6e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -28,6 +28,7 @@ constexpr uint32_t DEFAULT_BATCH_SIZE_K = 512; // // Test read/write bw and latency from host/dram/l1 ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; uint32_t iterations_g = DEFAULT_ITERATIONS; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index 6f25b8c6c04..ae6c2cf33a3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -36,6 +36,7 @@ constexpr uint32_t MIN_PAGED_WRITE_ADDR = 512 * 1024; // Disable randomization b // // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; 
using namespace tt; uint32_t iterations_g = DEFAULT_ITERATIONS; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index 510406c08f9..dd5e90f3ac2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -24,6 +24,7 @@ constexpr uint32_t MAX_ARGS = 255; // // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; uint32_t iterations_g = DEFAULT_ITERATIONS; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 866f5193212..ba38f8ac8db 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -53,6 +53,7 @@ constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d; // // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; uint32_t iterations_g = DEFAULT_ITERATIONS; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 4541164697f..984cb6e3483 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -17,6 +17,7 @@ #include "tt_metal/impl/debug/dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" +using std::vector; using namespace tt; // took from 
bmm_op.cpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index f3e6a893ccd..ecef4cf8d6e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -18,6 +18,7 @@ #define LAUNCH +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp index 81b30d3b56f..4bae04746bd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp @@ -10,6 +10,7 @@ #include "kernels/traffic_gen_test.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 431fe73700f..07a59aefad8 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index c251a567871..41bdccef04b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include 
"kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp index 7b9b45ba68c..6a8c1753b75 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp index 3c501badf88..f311a276896 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index 1babd91d899..fb21f47cf07 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp index ee9b74ea473..996b9361564 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp @@ -10,6 +10,7 @@ #include 
"tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp index dec4e9ab3ca..1a33b8c655b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp index b0b8930326c..c111007d735 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp @@ -10,6 +10,7 @@ #include "kernels/traffic_gen_test.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index ed64139831d..8402cf52f6e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index bd6708dca7d..5a06741d0df 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -10,6 +10,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "kernels/traffic_gen_test.hpp" +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index b60816779f6..932390177b8 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -16,6 +16,7 @@ #include "test_gold_impls.hpp" #include "constants.hpp" +using std::vector; using namespace tt; using namespace constants; diff --git a/tests/tt_metal/tt_metal/test_bmm.cpp b/tests/tt_metal/tt_metal/test_bmm.cpp index 21f021714cc..c91b6de5a36 100644 --- a/tests/tt_metal/tt_metal/test_bmm.cpp +++ b/tests/tt_metal/tt_metal/test_bmm.cpp @@ -11,6 +11,7 @@ #include "common/bfloat16.hpp" #include "test_gold_impls.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp b/tests/tt_metal/tt_metal/test_clean_init.cpp index 770f0f34f4a..1e2ab1c8613 100644 --- a/tests/tt_metal/tt_metal/test_clean_init.cpp +++ b/tests/tt_metal/tt_metal/test_clean_init.cpp @@ -14,6 +14,7 @@ * recover from a "bad" state. 
*/ +using std::vector; using namespace tt::tt_metal; int main(int argc, char **argv) { diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index 794404b1a65..5daa58616c4 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; bool test_compile_args(std::vector compile_args_vec, tt_metal::Device *device) { diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index dac634833e2..a093bb91fcf 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -17,6 +17,7 @@ #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/kernels/kernel.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 36909cb681d..7727c0a13ff 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -19,6 +19,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; std::string get_latest_kernel_binary_path(uint32_t mask, const std::shared_ptr kernel) { diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index 6140121289d..bb32b100d4a 100644 --- 
a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -17,6 +17,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; void check_program_is_mapped_to_correct_cores(const tt_metal::Program &program, const CoreRangeSet &core_range_set, const std::vector &compute_kernel_args) { diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index 809d0a5003a..911916a5d0e 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; namespace unary_datacopy { diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 16795cccb8b..59f9d90fe27 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; int main(int argc, char **argv) { diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index c7577a40435..d66f5441fc4 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -15,6 +15,7 @@ 
////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; namespace unary_datacopy { diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 6583d62ecb1..7abd5863f8f 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -12,6 +12,7 @@ #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index 430436cf829..6322da0dc4e 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; uint32_t prod(vector &shape) { diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index 8f6269b2c39..6a125cf8c21 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -16,6 +16,7 @@ // This test is similar to test_matmul_large_block. // The only difference is that it uses generic_binary_reader_kernel instead of reader_matmul_blocked kernel. 
////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_gold_impls.hpp b/tests/tt_metal/tt_metal/test_gold_impls.hpp index e2737044df6..e989339bbb7 100644 --- a/tests/tt_metal/tt_metal/test_gold_impls.hpp +++ b/tests/tt_metal/tt_metal/test_gold_impls.hpp @@ -61,7 +61,7 @@ struct BcastOp { // These constants above map to ops in llk_3c.h: // add_tiles_bcast, sub_tiles_bcast, mul_tiles_bcast - static const vector all() { return { ADD, SUB, MUL }; } + static const std::vector all() { return { ADD, SUB, MUL }; } }; @@ -122,7 +122,7 @@ inline std::vector gold_bmm( const std::vector shapeA, const std::vector& A, const std::vector& shapeB, - const vector& B, + const std::vector& B, bool acc16 = false ) { @@ -132,12 +132,12 @@ inline std::vector gold_bmm( uint32_t K = shapeA[3]; TT_FATAL(shapeB[2] == K, "Error"); uint32_t N = shapeB[3]; - vector shapeC{1, nb, M, N}; + std::vector shapeC{1, nb, M, N}; TensAddr addrC(shapeC); TensAddr addrA(shapeA); TensAddr addrB(shapeB); - vector result(addrC.numel()); - vector resultf(addrC.numel()); + std::vector result(addrC.numel()); + std::vector resultf(addrC.numel()); std::fill(resultf.begin(), resultf.end(), 0); for (int ib = 0; ib < nb; ib++) diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index ee499eac073..c5677750107 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -19,6 +19,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; bool test_write_interleaved_sticks_and_then_read_interleaved_sticks(const 
tt::ARCH& arch) { diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp index c3da4364db7..c05244e90df 100644 --- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index e8c63539ff2..8e0a625fa51 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp index d2ffdee3654..2ebe6804c8a 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp 
b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index 21692b41c63..73f2df50cd8 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp index 567c7b2d0e8..ea75884ff97 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp index ab87b99b88c..51f703323ff 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git 
a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 26a8773d4e7..13ad19b9b5d 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp index 131e96d497f..f7b7da888f9 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp index 72e910db512..0b68b76326e 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp @@ -16,6 +16,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; // Given a tensor that is row-major datums, make it tilized diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp index b4f52531989..dcd371a9d3d 100644 --- 
a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; int main(int argc, char **argv) { diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp index bcafffbe44e..df3273a197b 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp @@ -14,6 +14,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; int main(int argc, char **argv) { diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp index bdfe897259b..4a58e403a1a 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp @@ -15,6 +15,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; int main(int argc, char **argv) { diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp index 5a1e242b314..7a24596843b 100644 --- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp +++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp @@ -16,6 +16,7 @@ 
////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; std::tuple create_program( diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp index a7c0dd7e70f..eef89366d45 100644 --- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp +++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp @@ -12,6 +12,7 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "test_tiles.hpp" +using std::vector; using namespace tt; struct BinaryOpType { diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp index 060ec8ac4ca..e3392ac0d23 100644 --- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp @@ -13,6 +13,7 @@ #include "test_tiles.hpp" +using std::vector; using namespace tt; using std::uint32_t; diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp index fcf74dbebfc..6d783347e7b 100644 --- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp @@ -11,6 +11,7 @@ #include "common/bfloat16.hpp" #include "test_gold_impls.hpp" +using std::vector; using namespace tt; inline std::vector gold_standard_untilize(std::vector src_vec, std::vector shape) { diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp index 4e3ea25ad9f..5b3f3cd7851 100644 --- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp @@ -7,6 +7,7 @@ #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/detail/tt_metal.hpp" +using std::vector; 
using namespace tt; using namespace tt::tt_metal; uint32_t NUM_TILES = 2048; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp index 58689b51aa9..9be219332a0 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp @@ -13,6 +13,7 @@ #include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" +using std::vector; using namespace tt; namespace unit_tests::initialize_semaphores { diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp index 7aeabc409a2..62418427c21 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp @@ -13,6 +13,7 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using namespace tt::tt_metal; namespace basic_tests::buffer::banked { diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp index b57bff769f8..1df5ec9cdfd 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp @@ -10,6 +10,7 @@ #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" +using std::vector; using namespace tt::tt_metal; namespace basic_tests::circular_buffer { diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp index 199aa429f88..1196f802b36 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp +++ 
b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp @@ -9,6 +9,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/impl/buffers/circular_buffer.hpp" +using std::vector; using namespace tt::tt_metal; namespace basic_tests::circular_buffer { diff --git a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp index 9ccaa93d4bd..c48639f4f36 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp @@ -29,7 +29,7 @@ class DeviceFixture : public ::testing::Test { num_devices_ = 2; } - vector ids; + std::vector ids; for (unsigned int id = 0; id < num_devices_; id++) { ids.push_back(id); } @@ -94,7 +94,7 @@ class GalaxyFixture : public ::testing::Test { void InitializeDevices() { const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - vector ids; + std::vector ids; for (uint32_t id = 0; id < num_devices; id++) { ids.push_back(id); diff --git a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp index 355c0b03ab3..08e57a5cb2a 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp @@ -23,7 +23,7 @@ class N300DeviceFixture : public ::testing::Test { num_devices_ = tt::tt_metal::GetNumAvailableDevices(); if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 2 and tt::tt_metal::GetNumPCIeDevices() == 1) { - vector ids; + std::vector ids; for (unsigned int id = 0; id < num_devices_; id++) { ids.push_back(id); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp index ac551ffe7f2..d12d89bd88f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp +++ 
b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp @@ -10,6 +10,7 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "test_golden_impls.hpp" +using std::map; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp index cb9f7bbaf69..314bf2fb127 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp @@ -6,6 +6,7 @@ #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp index c3269675049..655aeb87cfe 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp @@ -8,6 +8,7 @@ #include "tt_metal/host_api.hpp" #include "common/bfloat16.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp index 6988cfb5277..4afc02acaa8 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp @@ -12,6 +12,7 @@ #include "tt_metal/detail/tt_metal.hpp" #include "tests/tt_metal/test_utils/packing.hpp" +using std::vector; namespace unit_tests::compute { diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp index 95c902c026f..b55c6329938 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp @@ -7,6 +7,7 @@ #include "tt_metal/common/bfloat8.hpp" #include "tt_metal/test_utils/comparison.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp index 596beb20e8d..8439126997a 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp @@ -20,6 +20,7 @@ #include "common/test_tiles.hpp" #include "common/bfloat16.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; using namespace tt::test_utils; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp index 9a80a7d8819..0a7822c6fbf 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp @@ -17,6 +17,8 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::map; +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp index 3cfd8a8e7cc..19baa412647 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp @@ -17,6 +17,8 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::map; +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git 
a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp index c6b0ccc87dd..76736184a2e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp @@ -19,6 +19,7 @@ #include "test_golden_impls.hpp" #include "common/test_tiles.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp index 91071283a51..5dbdcc7ab6d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp @@ -18,6 +18,7 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "test_golden_impls.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp index d475d3c897b..5729e1a6c4b 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp @@ -7,6 +7,8 @@ #include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" +using std::vector; + namespace basic_tests::CoreRange { TEST_F(CoreCoordHarness, TestCoreRangeIterator) diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp index 081d72fb3a1..791f033d127 100644 --- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp @@ -16,6 +16,7 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using 
namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp index e7640e332d0..f70039820a5 100644 --- a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp @@ -12,6 +12,7 @@ #include "tt_metal/test_utils/env_vars.hpp" #include "tt_metal/common/math.hpp" +using std::vector; using namespace tt::tt_metal; // TODO: Remove dependency on "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" and remove globals diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp index 75c8edc0ae3..6cad6f5d625 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp @@ -18,6 +18,7 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp index 6e82d8b87b4..f4dfae4d653 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp @@ -17,6 +17,7 @@ #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/device/device_pool.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp index 510787d324b..5ee4c1fa79c 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp @@ -48,7 +48,7 @@ class CommonFixture: public ::testing::Test { protected: tt::ARCH arch_; - vector devices_; + std::vector devices_; bool slow_dispatch_; bool has_remote_devices_; @@ -70,7 +70,7 @@ class CommonFixture: public ::testing::Test { // An extra flag for if we have remote devices, as some tests are disabled for fast // dispatch + remote devices. this->has_remote_devices_ = num_devices > num_pci_devices; - vector ids; + std::vector ids; for (unsigned int id = 0; id < num_devices; id++) { if (SkipTest(id)) continue; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp b/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp index 5f3332ebcc9..d8e3a4fefe1 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp @@ -6,6 +6,8 @@ #include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +using std::vector; + // Test sync w/ semaphores betweeen eth/tensix cores // Test will hang in the kernel if the sync doesn't work properly static void test_sems_across_core_types(CommonFixture *fixture, diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp index 3c3f2fa7d02..bc2356eb19d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp @@ -66,7 +66,7 @@ inline bool StringContainsWithWildcard(const string& s1, const string& s2, char // Check whether the given file contains a list of strings. Doesn't check for // strings between lines in the file. 
-inline bool FileContainsAllStrings(string file_name, const vector &must_contain) { +inline bool FileContainsAllStrings(string file_name, const std::vector &must_contain) { std::fstream log_file; if (!OpenFile(file_name, log_file, std::fstream::in)) return false; @@ -78,7 +78,7 @@ inline bool FileContainsAllStrings(string file_name, const vector &must_ string line; while (getline(log_file, line)) { // Check for all target strings in the current line - vector found_on_current_line; + std::vector found_on_current_line; for (const string &s : must_contain_set) { if (StringContainsWithWildcard(s, line, '*')) found_on_current_line.push_back(s); @@ -110,7 +110,7 @@ inline bool FileContainsAllStrings(string file_name, const vector &must_ // Check whether the given file contains a list of strings (in order). Doesn't check for strings // between lines in a file. -inline bool FileContainsAllStringsInOrder(string file_name, const vector &must_contain) { +inline bool FileContainsAllStringsInOrder(string file_name, const std::vector &must_contain) { std::fstream log_file; if (!OpenFile(file_name, log_file, std::fstream::in)) return false; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp index 3a87f5d8365..b7b5d17b241 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp @@ -17,6 +17,7 @@ #include "tests/tt_metal/test_utils/print_helpers.hpp" #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; namespace unit_tests_common::matmul::test_matmul_X_tile{ diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp 
b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp index c808649943c..35722184b45 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp @@ -16,6 +16,7 @@ #include "tests/tt_metal/test_utils/print_helpers.hpp" #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp index 2eb2e6975a4..353e9084340 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp @@ -16,6 +16,7 @@ #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_multi_core_X_dram { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index 1a24e668847..ff60ec1b853 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -16,6 +16,7 @@ #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" #include 
"tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp index 416a13037bd..6f7aee8b645 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp @@ -16,6 +16,7 @@ #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_inX_mcast { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp index 6c4fe79c023..5db120dd9ba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp @@ -17,6 +17,7 @@ #include "tests/tt_metal/test_utils/print_helpers.hpp" #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_single_core{ diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp index 76e7bb007ad..5dbadc80812 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp +++ 
b/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp @@ -14,6 +14,7 @@ #include "llrt/llrt.hpp" +using std::vector; using namespace tt; namespace gtest_smoke::test_flatten{ diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp index 7128ad2dd25..0370b51f3f2 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp @@ -7,6 +7,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking that the finish command can wait for the last dprint. ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp index 8ba4b94cb89..dd23509745b 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp @@ -8,6 +8,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher polling the eth link training counter. 
////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp index ef31e02d8a3..600872d58ac 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp @@ -13,6 +13,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher NOC sanitization. ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp index 46b720e6508..1a21a43a187 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp @@ -8,6 +8,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher pause feature. ////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp index 4dafe78f539..ffc9fb62e57 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp @@ -10,6 +10,7 @@ ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher waypoints. 
////////////////////////////////////////////////////////////////////////////////////////// +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index ec8238158d1..2ad2bb7842d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -14,6 +14,7 @@ #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/kernels/kernel.hpp" +using std::vector; using namespace tt::tt_metal; struct CBConfig { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp index 65b6b20d57f..755f5892db0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp @@ -18,6 +18,7 @@ #include "tt_metal/host_api.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index fc6345f5e9a..e380c41d67c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -13,6 +13,7 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt::tt_metal; struct BufferStressTestConfig { diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp index 03e6eb1004c..e4eceaffb9c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp @@ -16,6 +16,7 @@ #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/impl/buffers/circular_buffer.hpp" +using std::vector; using namespace tt::tt_metal; namespace host_cq_test_utils { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp index 5a772063742..023462a6cd2 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp @@ -12,6 +12,7 @@ #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" +using std::vector; using namespace tt::tt_metal; enum class DataMovementMode: uint8_t { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index 28e0089323f..83746fe8a54 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -19,6 +19,8 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" +using std::map; +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp index 
1dd077cb1b5..ae36623be52 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp @@ -21,6 +21,7 @@ #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp index 2d67bad56d3..53d9f0d5707 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp @@ -19,6 +19,7 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp index d339c115865..3fc76d32d74 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp @@ -22,6 +22,8 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" +using std::map; +using std::vector; using namespace tt; using namespace tt::test_utils; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp index 6a6e7517591..4e407df6d4e 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp @@ -13,6 +13,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" +using std::vector; using namespace tt::tt_metal; struct CBConfig { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index cef5a8d0c18..1a933c8a2f9 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -8,6 +8,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt::tt_metal; struct TestBufferConfig { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp index b6c3e82791c..1c08c86fa15 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp @@ -13,6 +13,7 @@ #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt::tt_metal; namespace local_test_functions { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index f294316c6e4..6932ab11955 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -11,6 +11,7 @@ #include "tt_metal/test_utils/env_vars.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp index e99ea309d5a..a42dd078797 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp @@ -29,7 +29,7 @@ struct BufferStressTestConfig { inline std::vector generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) { TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error"); - vector src(size_bytes / sizeof(uint32_t), 0); + std::vector src(size_bytes / sizeof(uint32_t), 0); for (uint32_t i = 0; i < src.size(); i++) { src.at(i) = start + i; diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp b/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp index 255ce5d72e6..75116172d4d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp @@ -7,6 +7,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" +using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git 
a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py new file mode 100644 index 00000000000..2cbe8f5aa29 --- /dev/null +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py @@ -0,0 +1,327 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +from loguru import logger +import ttnn +from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc +from models.utility_functions import skip_for_grayskull + +from ttnn import ShardTensor2dMesh, ConcatMesh2dToTensor + + +def report_mismatches(golden, actual, max_printable=None): + printed = 0 + for w in range(golden.shape[0]): + for z in range(golden.shape[1]): + for y in range(0, golden.shape[2], 32): + for x in range(0, golden.shape[3], 32): + print_it = (max_printable is None or printed < max_printable) and golden[w, z, y, x] != actual[ + w, z, y, x + ] + if print_it: + printed += 1 + print( + f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {golden[w, z, y, x]} != actual {actual[w, z, y, x]}" + ) + + +def print_tile_corners_of_tensor(t): + for w in range(t.shape[0]): + for z in range(t.shape[1]): + str = "" + for x in range(0, t.shape[3], 32): + str += f"{x:<5} "[:5] + print(f" {str}") + for y in range(0, t.shape[2], 32): + str_vals = f"y={y:<3} "[:5] + for x in range(0, t.shape[3], 32): + yy = 0 + xx = 0 + val = int(t[w, z, y + yy, x + xx].item()) + str_vals += f"{val:<5} "[:5] + print(f"{str_vals}") + + +def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows( + mesh_device, + num_devices_per_line, + per_chip_input_shape, + tensor_memory_layout, + dim, + num_links, + math_op, + input_dtype, + layout, + buffer_type: ttnn.BufferType, + use_program_cache, + function_level_defaults, + enable_async, + input_shard_spec: ttnn.ShardSpec = None, + num_reduce_scatter_instances: int = 1, + 
num_iters: int = 1, + cluster_axis: int = 0, +): + if len(mesh_device.get_devices()) != 32: + pytest.skip("Not TG!") + for d in mesh_device.get_devices(): + ttnn.enable_program_cache(d) + mesh_device.enable_async(enable_async) + + per_reduce_scatter_output_shape = list(per_chip_input_shape) + per_reduce_scatter_output_shape[dim] *= num_devices_per_line + full_mesh_input_shape = list(per_reduce_scatter_output_shape) + ## The `reduce_scatter_instances_concat_dim` is the dimension we will split the cluster spanning tensor along in order to split it + ## off into per-all-gather tensors + reduce_scatter_instances_concat_dim = 1 if dim == 0 else 0 + full_mesh_input_shape[reduce_scatter_instances_concat_dim] *= num_reduce_scatter_instances + logger.info( + f"full_mesh_input_shape: {full_mesh_input_shape}, dim: {dim}, reduce_scatter_instances_concat_dim: {reduce_scatter_instances_concat_dim}, num_devices_per_line: {num_devices_per_line}" + ) + + ## + ## Compute golden + ## + + per_chip_output_shape = list(per_chip_input_shape) + per_chip_output_shape[dim] //= num_devices_per_line + per_reduce_scatter_inputs = [] + per_reduce_scatter_goldens = [] + for i in range(num_reduce_scatter_instances): + per_chip_inputs = [torch.rand(per_chip_input_shape).bfloat16() for _ in range(num_devices_per_line)] + per_reduce_scatter_inputs.append(per_chip_inputs) + + golden_canonical_out_tensor = torch.zeros(per_chip_input_shape).bfloat16() + for t in per_chip_inputs: + golden_canonical_out_tensor = torch.add(golden_canonical_out_tensor, t).bfloat16() + per_reduce_scatter_goldens.append(golden_canonical_out_tensor) + + per_reduce_scatter_concatenated_inputs = [ + torch.cat(per_reduce_scatter_inputs[i], dim=dim) for i in range(num_reduce_scatter_instances) + ] + + full_input_tensor_unfractured = torch.cat( + per_reduce_scatter_concatenated_inputs, dim=reduce_scatter_instances_concat_dim + ) + + input_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, 
shard_spec=input_shard_spec) + shard_dims = ( + (dim, reduce_scatter_instances_concat_dim) if cluster_axis == 0 else (reduce_scatter_instances_concat_dim, dim) + ) + concat_dims = shard_dims + + mesh_shape = ( + (num_devices_per_line, num_reduce_scatter_instances) + if cluster_axis == 0 + else (num_reduce_scatter_instances, num_devices_per_line) + ) + + output_shard_spec = None + if input_shard_spec is not None: + output_shard_shape = list(input_shard_spec.shape) + if dim == 3: + output_shard_shape[1] *= num_devices_per_line + else: + output_shard_shape[0] *= num_devices_per_line + output_shard_spec = ttnn.ShardSpec( + input_shard_spec.grid, + output_shard_shape, + input_shard_spec.orientation, + False, + ) + output_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=output_shard_spec) + ttnn_tensor = ttnn.from_torch( + full_input_tensor_unfractured, + dtype=input_dtype, + device=mesh_device, + layout=layout, + memory_config=input_mem_config, + mesh_mapper=ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=shard_dims), + ) + ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device) + + # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor) + ttnn_tensor_out = ttnn.reduce_scatter( + ttnn_tensor, + scatter_dim=dim, + cluster_axis=cluster_axis, + mesh_device=mesh_device, + math_op=math_op, + num_links=num_links, + memory_config=output_mem_config, + topology=ttnn.Topology.Linear, + ) + trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0) + # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor) + for _ in range(num_iters): + ttnn_tensor_out = ttnn.reduce_scatter( + ttnn_tensor, + scatter_dim=dim, + cluster_axis=cluster_axis, + mesh_device=mesh_device, + math_op=math_op, + num_links=num_links, + memory_config=output_mem_config, + topology=ttnn.Topology.Linear, + ) + ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0) + for d in mesh_device.get_devices(): + ttnn.synchronize_device(d) + + logger.info("Starting 
Trace perf test...") + ttnn.execute_trace(mesh_device, trace_id, blocking=False) + ttnn.release_trace(mesh_device, trace_id) + for d in mesh_device.get_devices(): + ttnn.synchronize_device(d) + + # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out) + tt_output_tensor = ttnn.to_torch( + ttnn_tensor_out, mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims) + ) + output_tensors_list = torch.chunk( + tt_output_tensor, num_reduce_scatter_instances, dim=reduce_scatter_instances_concat_dim + ) + + passed = True + for i in range(num_reduce_scatter_instances): + # The result of all-chips in the reduce scatter line having their outputs concatenated + reduce_scatter_outputs_concatenated = output_tensors_list[i] + per_chip_outputs = torch.chunk(reduce_scatter_outputs_concatenated, num_devices_per_line, dim=dim) + per_chip_goldens = torch.chunk(per_reduce_scatter_goldens[i], num_devices_per_line, dim=dim) + + assert len(per_chip_outputs) == len(per_chip_goldens) + # compare the output and golden (zip) + for d, (output, golden) in enumerate(zip(per_chip_outputs, per_chip_goldens)): + eq, output = comp_pcc(output, golden) + + if not eq: + passed = False + logger.error(f"output mismatch for tensor on reduce_scatter {i}, device {d}: {output}") + + assert passed, f"FAILED: {output}" + + +# Enumerate the post-commit cases explicitly +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links, per_chip_output_shape, dim, layout", + [ + (4, 1, [1, 4, 32, 2304], 1, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + # ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "buffer_type", + [ + ttnn.BufferType.DRAM, + ttnn.BufferType.L1, + ], +) +@pytest.mark.parametrize("replication_factor", [8]) # 1, 8]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], 
indirect=True) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 10281600}], indirect=True) +def test_line_reduce_scatter_on_TG_rows_post_commit( + mesh_device, + num_devices, + per_chip_output_shape, + dim, + num_links, + math_op, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async, + replication_factor, + num_iters=16, +): + run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows( + mesh_device, + num_devices, + per_chip_output_shape, + ttnn.TensorMemoryLayout.INTERLEAVED, + dim, + num_links, + math_op, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async=enable_async, + num_iters=num_iters, + num_reduce_scatter_instances=replication_factor, + cluster_axis=1, + ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links, per_chip_output_shape, dim, layout", + [ + (8, 1, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT), + (8, 1, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ], +) +@pytest.mark.parametrize( + "buffer_type", + [ + ttnn.BufferType.DRAM, + ], +) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("replication_factor", [4]) +@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 10281600}], indirect=True) +def test_line_reduce_scatter_on_TG_cols_post_commit( + mesh_device, + num_devices, + per_chip_output_shape, + dim, + num_links, + math_op, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async, + replication_factor, + num_iters=16, +): + run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows( + mesh_device, + 
num_devices, + per_chip_output_shape, + ttnn.TensorMemoryLayout.INTERLEAVED, + dim, + num_links, + math_op, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async=enable_async, + num_iters=num_iters, + num_reduce_scatter_instances=replication_factor, + cluster_axis=0, + ) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py b/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py new file mode 100644 index 00000000000..de4122f29ad --- /dev/null +++ b/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import torch + +import ttnn + +from tests.ttnn.utils_for_testing import assert_with_pcc +from torch.nn import functional as F + + +@pytest.mark.parametrize("h", [32]) +@pytest.mark.parametrize("w", [64]) +def test_mul_channel_bcast_repeat(device, h, w): + torch_input_tensor_a = torch.rand((16, 16, h, w), dtype=torch.bfloat16) + torch_input_tensor_b = torch.rand((16, 1, h, w), dtype=torch.bfloat16) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device) + output = ttnn.mul(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + torch_output_tensor = torch.mul(torch_input_tensor_a, torch_input_tensor_b) + assert_with_pcc(torch_output_tensor, output, 0.9999) + + +@pytest.mark.parametrize("h", [32]) +@pytest.mark.parametrize("w", [64]) +def test_mul_batch_bcast_repeat(device, h, w): + torch_input_tensor_a = torch.rand((1, 16, h, w), dtype=torch.bfloat16) + torch_input_tensor_b = torch.rand((16, 16, h, w), dtype=torch.bfloat16) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device) 
+ output = ttnn.mul(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + torch_output_tensor = torch.mul(torch_input_tensor_a, torch_input_tensor_b) + assert_with_pcc(torch_output_tensor, output, 0.9999) diff --git a/tests/ttnn/unit_tests/operations/test_bernoulli.py b/tests/ttnn/unit_tests/operations/test_bernoulli.py new file mode 100644 index 00000000000..104259801b9 --- /dev/null +++ b/tests/ttnn/unit_tests/operations/test_bernoulli.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +import ttnn +import numpy as np +from tests.ttnn.unit_tests.operations.test_utils import ( + get_compute_kernel_options, + compute_kernel_options, + compute_kernel_ids, + get_lib_dtype, +) +from models.utility_functions import skip_for_grayskull +from collections import Counter +from loguru import logger + + +def run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc=False, compute_kernel_options=None): + compute_kernel_config = get_compute_kernel_options(compute_kernel_options) + cpu_input = torch.rand(shape, dtype=get_lib_dtype(torch, in_dtype)) + npu_input = ttnn.from_torch(cpu_input, device=device, dtype=get_lib_dtype(ttnn, in_dtype), layout=ttnn.TILE_LAYOUT) + + npu_output = None + if is_out_alloc: + cpu_output = torch.rand(shape, dtype=get_lib_dtype(torch, out_dtype)) + npu_output = ttnn.from_torch( + cpu_output, device=device, dtype=get_lib_dtype(ttnn, out_dtype), layout=ttnn.TILE_LAYOUT + ) + + one_probs = [] + for _ in range(10): + if is_out_alloc: + ttnn.bernoulli( + npu_input, + output=npu_output, + dtype=get_lib_dtype(ttnn, out_dtype), + compute_kernel_config=compute_kernel_config, + ) + else: + npu_output = ttnn.bernoulli( + npu_input, + dtype=get_lib_dtype(ttnn, out_dtype), + compute_kernel_config=compute_kernel_config, + ) + + tt_output = ttnn.to_torch(npu_output).reshape(shape) + tt_output_list = tt_output.flatten().tolist() + + c = 
Counter(tt_output_list) + one_probs.append(c[1] / len(tt_output_list)) + + expected_one_prob = 0.5 + assert np.allclose(expected_one_prob, np.mean(one_probs), rtol=0.05) + + +# fmt: off +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize("shape", + [ + [2003], + [500, 500], + [1, 512, 2, 256], + ], +) +@pytest.mark.parametrize("in_dtype", + [ + "bfloat16", + "float32" + ] +) +@pytest.mark.parametrize("out_dtype", + [ + "bfloat16", + "float32" + ] +) +@pytest.mark.parametrize("is_out_alloc", + [ + True, + False + ] +) +# fmt: on +def test_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc): + torch.manual_seed(0) + run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "shape", + [ + [1, 21, 123, 24], + ], +) +@pytest.mark.parametrize("in_dtype", ["float32"]) +@pytest.mark.parametrize("out_dtype", ["float32"]) +@pytest.mark.parametrize("is_out_alloc", [True, False]) +def test_bernoulli_callback(shape, in_dtype, out_dtype, device, is_out_alloc, use_program_cache): + torch.manual_seed(0) + num_program_cache_entries_list = [] + for i in range(2): + run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc) + # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr + tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device) + num_program_cache_entries_list.append(device.num_program_cache_entries()) + logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}") + assert num_program_cache_entries_list[0] > 0 + assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1] + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "shape", + [[512, 512], [5, 4, 70, 40]], +) +@pytest.mark.parametrize("in_dtype", ["float32"]) +@pytest.mark.parametrize("out_dtype", ["float32"]) 
+@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids) +def test_uniform_with_compute_kernel_options(shape, in_dtype, out_dtype, device, compute_kernel_options): + torch.manual_seed(0) + run_bernoulli(shape, in_dtype, out_dtype, device, compute_kernel_options) diff --git a/tests/ttnn/unit_tests/operations/test_index_fill.py b/tests/ttnn/unit_tests/operations/test_index_fill.py new file mode 100644 index 00000000000..8935f5c5bab --- /dev/null +++ b/tests/ttnn/unit_tests/operations/test_index_fill.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import ttnn +import torch +from tests.ttnn.utils_for_testing import assert_equal + + +def run_index_fill_test(shape, dim, value, dtype, device): + if len(shape) - 1 < dim: + pytest.skip("Given dim is higher than tensor rank") + + if dtype == torch.int32: + torch_input = torch.randint(0, 100, shape, dtype=torch.int32) + else: + torch_input = torch.rand(shape, dtype=dtype) + torch_index = torch.tensor([0, 2]) + torch_output = torch.index_fill(torch_input, dim, torch_index, value) + + tt_input = ttnn.from_torch(torch_input, device=device) + tt_index = ttnn.from_torch(torch_index, device=device) + + ttnn_output = ttnn.index_fill(tt_input, dim, tt_index, value) + ttnn_output = ttnn.to_torch(ttnn_output) + + assert assert_equal(ttnn_output, torch_output) + + +@pytest.mark.parametrize( + "shape", + [ + [32, 32], # multiple of 32 + [12, 24], # not multiple of 32 + [23, 41, 32], # multiple of 32 + [9, 5, 38], # not multiple of 32 + [3, 4, 5, 32], # multiple of 32 + [41, 21, 33, 34], # not multiple of 32, + ], +) +@pytest.mark.parametrize( + "dim", + [ + 0, + 1, + 2, + 3, + ], +) +@pytest.mark.parametrize( + "value", + [ + 2.5, + 1.72, + ], +) +@pytest.mark.parametrize( + "dtype", + [ + torch.float32, + torch.bfloat16, + ], +) +def test_index_fill_float(shape, dim, value, dtype, device): + 
torch.manual_seed(2024) + + run_index_fill_test(shape, dim, value, dtype, device) + + +@pytest.mark.parametrize( + "shape", + [ + [32, 32], # multiple of 32 + [12, 23], # not multiple of 32 + [27, 12, 32], # multiple of 32 + [61, 3, 6], # not multiple of 32 + [6, 3, 7, 32], # multiple of 32 + [13, 15, 22, 13], # not multiple of 32 + ], +) +@pytest.mark.parametrize( + "dim", + [ + 0, + 1, + 2, + 3, + ], +) +@pytest.mark.parametrize( + "value", + [ + 15, + 12, + ], +) +def test_index_fill_int(shape, dim, value, device): + torch.manual_seed(2024) + + run_index_fill_test(shape, dim, value, torch.int32, device) + + +@pytest.mark.parametrize( + "shape", + [ + [32, 32], # multiple of 32 + [12, 23], # not multiple of 32 + [27, 12, 32], # multiple of 32 + [61, 3, 6], # not multiple of 32 + [4, 3, 7, 32], # multiple of 32 + [13, 15, 22, 13], # not multiple of 32 + ], +) +@pytest.mark.parametrize( + "dim", + [ + 0, + ], +) +@pytest.mark.parametrize( + "value", + [ + 2002, + ], +) +def test_index_fill_callback(shape, dim, value, device, use_program_cache): + torch.manual_seed(2024) + for i in range(2): + run_index_fill_test(shape, dim, value, torch.int32, device) + if i == 0: + num_program_cache_entries = device.num_program_cache_entries() + assert num_program_cache_entries > 0 + else: + assert device.num_program_cache_entries() == num_program_cache_entries + torch_dummy = torch.randn([32, 32]) + tt_dummy = ttnn.from_torch(torch_dummy, device=device) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py index 3e51cbd1c93..51b443396bf 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py @@ -12,14 +12,26 @@ get_compute_kernel_options, compute_kernel_options, compute_kernel_ids, + get_ttnn_torch_dtype, ) def get_tensors( - input_shape, other_shape, output_shape, require_input_grad, require_other_grad, is_1d, device, 
use_randint=True + input_shape, + other_shape, + output_shape, + require_input_grad, + require_other_grad, + is_1d, + device, + npu_dtype=ttnn.bfloat16, + use_randint=True, ): - npu_dtype = ttnn.bfloat16 - cpu_dtype = torch.bfloat16 + cpu_dtype = get_ttnn_torch_dtype(npu_dtype) + if cpu_dtype is None: + # panic + assert False + npu_layout = ttnn.TILE_LAYOUT cpu_layout = ttnn.ROW_MAJOR_LAYOUT @@ -33,9 +45,9 @@ def get_tensors( other = torch.rand(other_shape, dtype=cpu_dtype) output = torch.rand(output_shape, dtype=cpu_dtype) - tt_input = ttnn.Tensor(input, npu_dtype).pad_to_tile(float(1)).to(npu_layout).to(device) - tt_other = ttnn.Tensor(other, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - tt_output = ttnn.Tensor(output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + tt_input = ttnn.from_torch(input, npu_dtype, layout=npu_layout, device=device) + tt_other = ttnn.from_torch(other, npu_dtype, layout=npu_layout, device=device) + tt_output = ttnn.from_torch(output, npu_dtype, layout=npu_layout, device=device) torch_input = input.reshape(-1) if is_1d else input torch_other = other.reshape(-1) if is_1d else other @@ -44,25 +56,16 @@ def get_tensors( output_grad = tt_output_grad = torch_output_grad = tt_input_grad = tt_other_grad = None if require_input_grad or require_other_grad: output_grad = torch.randint(-2, 3, output_shape, dtype=cpu_dtype) - # tt_output_grad = ttnn.Tensor(output_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - tt_output_grad = ttnn.Tensor(output_grad, npu_dtype).pad_to_tile(float(-1)).to(npu_layout).to(device) + tt_output_grad = ttnn.from_torch(output_grad, npu_dtype, layout=npu_layout, device=device) torch_output_grad = output_grad[0][0][0][0] if is_1d else output_grad if require_input_grad: input_grad = torch.full(input_shape, float("nan"), dtype=cpu_dtype) - tt_input_grad = ttnn.Tensor(input_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + tt_input_grad = 
ttnn.from_torch(input_grad, npu_dtype, layout=npu_layout, device=device) if require_other_grad: other_grad = torch.full(other_shape, float("nan"), dtype=cpu_dtype) - tt_other_grad = ( - ttnn.Tensor( - other_grad, - npu_dtype, - ) - .pad_to_tile(float("nan")) - .to(npu_layout) - .to(device) - ) + tt_other_grad = ttnn.from_torch(other_grad, npu_dtype, layout=npu_layout, device=device) return ( tt_input, @@ -77,24 +80,7 @@ def get_tensors( ) -@pytest.mark.parametrize( - "input_shape", - ( - [1, 1, 1, 10], # test not mutiple of 32 case - [1, 1, 1, 32], # test single tile - [1, 1, 1, 352], # test multiple tiles - [1, 1, 1, 323], # test multiple tiles, not a multiple of 32 - ), -) -@pytest.mark.parametrize( - "requires_grad", - ( - (True, False), - (False, True), - (True, True), - ), -) -def test_moreh_matmul_1d_backward(input_shape, requires_grad, device): +def run_moreh_dot_backward(input_shape, requires_grad, device, dtype=ttnn.bfloat16, use_randint=True): torch.manual_seed(3072) require_input_grad, require_other_grad = requires_grad output_shape = [1, 1, 1, 1] @@ -109,7 +95,9 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device): torch_input, torch_other, torch_output_grad, - ) = get_tensors(input_shape, input_shape, output_shape, require_input_grad, require_other_grad, True, device) + ) = get_tensors( + input_shape, input_shape, output_shape, require_input_grad, require_other_grad, True, device, dtype, use_randint + ) # torch matmul torch_out = torch.matmul( torch_input.requires_grad_(require_input_grad), torch_other.requires_grad_(require_other_grad) @@ -125,7 +113,7 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device): rtol = atol = 0.1 cpu_layout = ttnn.ROW_MAJOR_LAYOUT if require_input_grad: - ttcpu_input_grad = tt_input_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch() + ttcpu_input_grad = ttnn.to_torch(tt_input_grad) passing, output_pcc = comp_allclose_and_pcc( torch_input.grad, 
ttcpu_input_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol @@ -135,7 +123,7 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device): assert passing if require_other_grad: - ttcpu_other_grad = tt_other_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch() + ttcpu_other_grad = ttnn.to_torch(tt_other_grad) passing, output_pcc = comp_allclose_and_pcc( torch_other.grad, ttcpu_other_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol @@ -143,3 +131,61 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device): logger.debug(f"other_grad passing={passing}") logger.debug(f"other_grad pcc={output_pcc}") assert passing + + +@pytest.mark.parametrize( + "input_shape", + ( + [1, 1, 1, 10], # test not mutiple of 32 case + [1, 1, 1, 32], # test single tile + [1, 1, 1, 352], # test multiple tiles + [1, 1, 1, 323], # test multiple tiles, not a multiple of 32 + ), +) +@pytest.mark.parametrize( + "requires_grad", + ( + [True, False], + [False, True], + [True, True], + ), +) +@pytest.mark.parametrize("use_randint", (True, False)) +@pytest.mark.parametrize("dtype", ([ttnn.bfloat16, ttnn.bfloat8_b])) +def test_moreh_dot_backward(input_shape, requires_grad, dtype, use_randint, device): + run_moreh_dot_backward(input_shape, requires_grad, device, dtype, use_randint) + + +@pytest.mark.parametrize( + "input_shape", + ( + [1, 1, 1, 10], # test not mutiple of 32 case + [1, 1, 1, 32], # test single tile + [1, 1, 1, 352], # test multiple tiles + [1, 1, 1, 323], # test multiple tiles, not a multiple of 32 + ), +) +@pytest.mark.parametrize( + "requires_grad", + ( + [True, False], + [False, True], + [True, True], + ), +) +def test_moreh_dot_backward_callback( + input_shape, + requires_grad, + device, + use_program_cache, +): + num_program_in_cache = [] + for i in range(2): + run_moreh_dot_backward(input_shape, requires_grad, device) + num_program_in_cache.append(device.num_program_cache_entries()) + dummy = torch.randn([32, 32]) + tt_dummy = 
ttnn.from_torch(dummy, device=device) + + logger.info(f"num_program_in_cache={num_program_in_cache}") + assert num_program_in_cache[0] > 0 + assert num_program_in_cache[0] == num_program_in_cache[1] diff --git a/tests/ttnn/unit_tests/operations/test_uniform.py b/tests/ttnn/unit_tests/operations/test_uniform.py index 0ee59766878..9c3f05a6a6a 100644 --- a/tests/ttnn/unit_tests/operations/test_uniform.py +++ b/tests/ttnn/unit_tests/operations/test_uniform.py @@ -94,9 +94,9 @@ def run_uniform(shape, rand_range, dtype, device, compute_kernel_options=None, m ) -# fmt: off @skip_for_grayskull("Requires wormhole_b0 to run") -@pytest.mark.parametrize("shape", +@pytest.mark.parametrize( + "shape", [ [32, 32], [64, 64], @@ -105,20 +105,8 @@ def run_uniform(shape, rand_range, dtype, device, compute_kernel_options=None, m [1024, 1024], ], ) -@pytest.mark.parametrize("rand_range", - [ - [0, 1], - [2.1, 9], - [-5.1, 1.2] - ] -) -@pytest.mark.parametrize("dtype", - [ - "bfloat16", - "float32" - ] -) -# fmt: on +@pytest.mark.parametrize("rand_range", [[0, 1], [2.1, 9], [-5.1, 1.2]]) +@pytest.mark.parametrize("dtype", ["bfloat16", "float32"]) def test_uniform(shape, rand_range, dtype, device): torch.manual_seed(0) run_uniform(shape, rand_range, dtype, device) diff --git a/tests/ttnn/unit_tests/operations/test_utils.py b/tests/ttnn/unit_tests/operations/test_utils.py index 82c1b0f6bd9..9c304c02d27 100644 --- a/tests/ttnn/unit_tests/operations/test_utils.py +++ b/tests/ttnn/unit_tests/operations/test_utils.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn +import torch from models.utility_functions import is_wormhole_b0 import copy import pytest @@ -177,3 +178,22 @@ def get_lib_dtype(lib, dtype): "int32": lib.int32, } return dtype_map.get(dtype, None) + + +def get_ttnn_torch_dtype(ttnn_dtype: ttnn.DataType) -> torch.dtype: + """ + Maps a ttnn.DataType to the corresponding torch dtype that can handle them. 
+ Parameters: + ttnn_dtype: ttnn.DataType + The ttnn data type to be mapped. + Returns: + torch.dtype or None + The corresponding torch dtype if the mapping exists, otherwise None. + """ + dtype_map = { + ttnn.bfloat16: torch.bfloat16, + ttnn.float32: torch.float32, + ttnn.bfloat8_b: torch.bfloat16, + ttnn.int32: torch.int32, + } + return dtype_map.get(ttnn_dtype, None) diff --git a/tests/ttnn/unit_tests/test_reshape.py b/tests/ttnn/unit_tests/test_reshape.py index 4413fa7b601..4ada4299f60 100644 --- a/tests/ttnn/unit_tests/test_reshape.py +++ b/tests/ttnn/unit_tests/test_reshape.py @@ -293,9 +293,6 @@ def test_reshape_tile_layout_only_change_shape(device): ((1, 1445, 192), (1445, 192)), ((1, 256), (1, 1, 256)), ((16, 1, 32), (16, 1, 32)), - ((32,), (1, 1, 1, 32)), - ((16,), (1, 1, 1, 16)), - ((48,), (1, 1, 1, 48)), ], ) @pytest.mark.parametrize("layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT]) diff --git a/tt_metal/common/base.hpp b/tt_metal/common/base.hpp index 7000e13cd9d..6b36d02ee7e 100644 --- a/tt_metal/common/base.hpp +++ b/tt_metal/common/base.hpp @@ -7,56 +7,22 @@ */ #pragma once -#include #include -#include -#include -#include -#include -#include "tt_metal/common/tt_backend_api_types.hpp" // These are the types exported to frontend team... 
-#include "tt_metal/common/assert.hpp" -#include "hostdevcommon/kernel_structs.h" -#include "eth_l1_address_map.h" -#include "common/constants.hpp" -#include "common/base_types.hpp" - -using std::array; -using std::ostream; -using std::uint8_t; -using std::uint32_t; -using std::uint64_t; -using std::vector; -using std::string; -using std::size_t; -using std::map; +// DO NOT ADD MORE CODE TO THIS FILE +// THIS FILE POLLUTES ALL TRANSLATION UNITS - tt_metal, ttnn, programming examples, tests, customer code +// FIXME: At least put this in tt namespace inline constexpr uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; } - -namespace tt -{ - -/** - * @brief Specifies the target devices on which the graph can be run. -*/ -enum class TargetDevice : uint8_t -{ - Silicon = 0, - Simulator = 1, - Invalid = 0xFF, -}; - -constexpr uint32_t MAX_AVAILABLE_CHIPS = 16; - -struct pair_hash { - template - std::size_t operator()(const std::pair &p) const - { - auto h1 = std::hash{}(p.first); - auto h2 = std::hash{}(p.second); - return h1 ^ h2; - } -}; - -} // end namespace tt +namespace tt { + /** + * @brief Specifies the target devices on which the graph can be run. 
+ */ + enum class TargetDevice : std::uint8_t + { + Silicon = 0, + Simulator = 1, + Invalid = 0xFF, + }; +} diff --git a/tt_metal/common/test_common.hpp b/tt_metal/common/test_common.hpp index eeaa8587d64..6cdf825d0dc 100644 --- a/tt_metal/common/test_common.hpp +++ b/tt_metal/common/test_common.hpp @@ -18,12 +18,12 @@ #include #include "common/metal_soc_descriptor.h" -// Needed for TargetDevice enum -#include "common/base.hpp" -inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, string output_dir = "") { +#include "tt_metal/common/base.hpp" + +inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, std::string output_dir = "") { // Ability to skip this runtime opt, since trimmed SOC desc limits which DRAM channels are available. - string tt_metal_home; + std::string tt_metal_home; if (getenv("TT_METAL_HOME")) { tt_metal_home = getenv("TT_METAL_HOME"); } else { diff --git a/tt_metal/detail/reports/compilation_reporter.cpp b/tt_metal/detail/reports/compilation_reporter.cpp index 2940e7fe879..9a681cbca0b 100644 --- a/tt_metal/detail/reports/compilation_reporter.cpp +++ b/tt_metal/detail/reports/compilation_reporter.cpp @@ -79,13 +79,13 @@ std::string kernel_attributes_str(std::shared_ptr kernel) { return attr_str; } -void CompilationReporter::add_kernel_compile_stats(const Program &program, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash) { +void CompilationReporter::add_kernel_compile_stats(uint64_t program_id, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash) { std::unique_lock lock(mutex_); if (cache_hit) { - this->program_id_to_cache_hit_counter_[program.get_id()].hits++; + this->program_id_to_cache_hit_counter_[program_id].hits++; } else { - this->program_id_to_cache_hit_counter_[program.get_id()].misses++; + this->program_id_to_cache_hit_counter_[program_id].misses++; } std::string kernel_stats = "," + kernel->name() + ","; std::string cache_status = 
cache_hit ? "cache hit" : "cache miss"; @@ -99,13 +99,13 @@ void CompilationReporter::add_kernel_compile_stats(const Program &program, std:: } index++; } - this->program_id_to_kernel_stats_[program.get_id()].push_back(kernel_stats); + this->program_id_to_kernel_stats_[program_id].push_back(kernel_stats); } -void CompilationReporter::flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled) { +void CompilationReporter::flush_program_entry(uint64_t program_id, size_t num_kernels, std::function(size_t)> get_kernel, bool persistent_compilation_cache_enabled) { std::unique_lock lock(mutex_); - auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program.get_id()).misses; - auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program.get_id()).hits; + auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program_id).misses; + auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program_id).hits; if (this->total_num_compile_programs_ == 0) { this->init_reports(); } @@ -113,8 +113,8 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi auto get_num_compute_and_data_movement_kernels = [&]() { uint32_t num_compute = 0; uint32_t num_data_movement = 0; - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - const auto kernel = detail::GetKernel(program, kernel_id); + for (size_t kernel_id = 0; kernel_id < num_kernels; kernel_id++) { + const auto kernel = get_kernel(kernel_id); if (kernel->processor() == tt::RISCV::BRISC or kernel->processor() == tt::RISCV::NCRISC) { num_data_movement++; } else { @@ -126,14 +126,14 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi auto [num_compute_kernels, num_data_movement_kernels] = get_num_compute_and_data_movement_kernels(); - this->summary_report_ << program.get_id() << ", " + this->summary_report_ << program_id << ", " << num_compute_kernels << ", " << 
num_data_movement_kernels << ", " << (persistent_compilation_cache_enabled ? "Y" : "N") << ", " << num_cache_misses << ", " << num_cache_hits << "\n"; - this->detailed_report_ << "Compiling Program: " << program.get_id() << "\n"; + this->detailed_report_ << "Compiling Program: " << program_id << "\n"; this->detailed_report_ << "\n,Kernel Creation Report:\n"; this->detailed_report_ << ",,Number of Compute CreateKernel API calls: " << num_compute_kernels << "\n"; this->detailed_report_ << ",,Number of Datamovement CreateKernel API calls: " << num_data_movement_kernels << "\n"; @@ -144,7 +144,7 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi this->detailed_report_ << ",,Total number of kernel compile cache hits: " << num_cache_hits << "\n"; this->detailed_report_ << "\n,Kernel File Name, Core Range, Cache Hit, Kernel Attributes, Hash\n"; - auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program.get_id()); + auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program_id); for (const auto &kernel_stats : kernel_stats_vec) { this->detailed_report_ << kernel_stats; } diff --git a/tt_metal/detail/reports/compilation_reporter.hpp b/tt_metal/detail/reports/compilation_reporter.hpp index c976bf5c8bc..23707b8eff3 100644 --- a/tt_metal/detail/reports/compilation_reporter.hpp +++ b/tt_metal/detail/reports/compilation_reporter.hpp @@ -45,9 +45,9 @@ class CompilationReporter { CompilationReporter(const CompilationReporter&) = delete; CompilationReporter(CompilationReporter&& other) noexcept = delete; - void add_kernel_compile_stats(const Program &program, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash); + void add_kernel_compile_stats(uint64_t program_id, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash); - void flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled); + void flush_program_entry(uint64_t program_id, size_t num_kernels, std::function(size_t)> get_kernel, 
bool persistent_compilation_cache_enabled); static CompilationReporter& inst(); static void toggle (bool state); static bool enabled (); diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp index 1bffc4421dd..5275d438742 100644 --- a/tt_metal/detail/reports/memory_reporter.cpp +++ b/tt_metal/detail/reports/memory_reporter.cpp @@ -100,14 +100,14 @@ void populate_reports(const Device *device, std::ofstream &memory_usage_summary_ write_memory_usage(device, BufferType::L1, memory_usage_summary_report, detailed_memory_usage_report, l1_usage_summary_report); } -void MemoryReporter::flush_program_memory_usage(const Program &program, const Device *device) { +void MemoryReporter::flush_program_memory_usage(uint64_t program_id, const Device *device) { if (not this->program_memory_usage_summary_report_.is_open()) { this->init_reports(); } - this->program_memory_usage_summary_report_ << program.get_id(); - this->program_l1_usage_summary_report_ << program.get_id(); - this->program_detailed_memory_usage_report_ << program.get_id(); + this->program_memory_usage_summary_report_ << program_id; + this->program_l1_usage_summary_report_ << program_id; + this->program_detailed_memory_usage_report_ << program_id; populate_reports(device, this->program_memory_usage_summary_report_, this->program_detailed_memory_usage_report_, this->program_l1_usage_summary_report_); } diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp index e5138f02a35..217f6490522 100644 --- a/tt_metal/detail/reports/memory_reporter.hpp +++ b/tt_metal/detail/reports/memory_reporter.hpp @@ -60,7 +60,7 @@ class MemoryReporter { MemoryReporter(const MemoryReporter&) = delete; MemoryReporter(MemoryReporter&& other) noexcept = delete; - void flush_program_memory_usage(const Program &program, const Device *device); + void flush_program_memory_usage(uint64_t program_id, const Device *device); void 
dump_memory_usage_state(const Device *device, std::string prefix="") const; diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index d6168102a5e..e5464e721a6 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -276,7 +276,7 @@ inline namespace v0 { void SetLazyCommandQueueMode(bool lazy); - DeviceAddr AllocateBuffer(const Buffer* buffer, bool bottom_up); + DeviceAddr AllocateBuffer(Buffer* buffer); void DeallocateBuffer(Buffer *buffer); } // namespace detail diff --git a/tt_metal/graph/graph_tracking.cpp b/tt_metal/graph/graph_tracking.cpp index 17a72ddd5ee..c12eff0d7ec 100644 --- a/tt_metal/graph/graph_tracking.cpp +++ b/tt_metal/graph/graph_tracking.cpp @@ -27,12 +27,12 @@ bool GraphTracker::add_hook(const std::shared_ptr& new_hook) { return true; } -void GraphTracker::track_allocate(const Buffer* buffer, bool bottom_up) { +void GraphTracker::track_allocate(const Buffer* buffer) { if (processors.empty()) { return; } for (auto& it : processors) { - it->track_allocate(buffer, bottom_up); + it->track_allocate(buffer); } } @@ -73,11 +73,11 @@ void GraphTracker::track_program(Program* program) { } } -bool GraphTracker::hook_allocate(const Buffer* buffer, bool bottom_up) { +bool GraphTracker::hook_allocate(const Buffer* buffer) { if (hook == nullptr) return false; - return hook->hook_allocate(buffer, bottom_up); + return hook->hook_allocate(buffer); } bool GraphTracker::hook_deallocate(Buffer* buffer) { diff --git a/tt_metal/graph/graph_tracking.hpp b/tt_metal/graph/graph_tracking.hpp index 54ee8eef41d..712373ab005 100644 --- a/tt_metal/graph/graph_tracking.hpp +++ b/tt_metal/graph/graph_tracking.hpp @@ -28,7 +28,7 @@ inline namespace v0 { IGraphProcessor() = default; - virtual void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {}; + virtual void track_allocate(const tt::tt_metal::Buffer* buffer) {}; virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {}; @@ -54,7 +54,7 @@ inline namespace 
v0 { class IGraphHooks { public: IGraphHooks() = default; - virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) = 0; + virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer) = 0; virtual bool hook_deallocate(tt::tt_metal::Buffer* buffer) = 0; @@ -77,7 +77,7 @@ inline namespace v0 { bool add_hook(const std::shared_ptr& hook); - void track_allocate(const Buffer* buffer, bool bottom_up); + void track_allocate(const Buffer* buffer); void track_deallocate(Buffer* buffer); @@ -118,7 +118,7 @@ inline namespace v0 { } } - bool hook_allocate(const Buffer* buffer, bool bottom_up); + bool hook_allocate(const Buffer* buffer); bool hook_deallocate(Buffer* buffer); diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index e8c35d24f54..5066024abf0 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -186,6 +186,8 @@ foreach(ARCH IN LISTS ARCHS) ${CMAKE_COMMAND} -E make_directory ${HW_LIB_DIR} COMMAND ${GPP_CMD} ${GPP_FLAGS} ${GPP_DEFINES} ${GPP_INCLUDES} -c -o ${HW_LIB_DIR}/${HWLIB}.o ${${HWLIB}_SOURCE} + DEPENDS + ${${HWLIB}_SOURCE} COMMENT "Building hw lib ${HWLIB}.o" VERBATIM ) diff --git a/tt_metal/hw/toolchain/erisc-b0-app-sections.ld b/tt_metal/hw/toolchain/erisc-b0-app-sections.ld index 7e393bf869b..3ec62a2055a 100644 --- a/tt_metal/hw/toolchain/erisc-b0-app-sections.ld +++ b/tt_metal/hw/toolchain/erisc-b0-app-sections.ld @@ -89,11 +89,6 @@ SECTIONS . += 4; } > REGION_LDM - data_noinit (NOLOAD): - { - *(data_noinit) - } > REGION_APP_DATA - l1_memory : { *(l1_memory) diff --git a/tt_metal/hw/toolchain/erisc-b0-kernel.ld b/tt_metal/hw/toolchain/erisc-b0-kernel.ld index bb2183b3e97..feefb637320 100644 --- a/tt_metal/hw/toolchain/erisc-b0-kernel.ld +++ b/tt_metal/hw/toolchain/erisc-b0-kernel.ld @@ -110,11 +110,6 @@ SECTIONS . 
+= 4; } > REGION_LDM - data_noinit (NOLOAD): - { - *(data_noinit) - } > REGION_APP_KERNEL_DATA - l1_memory : { *(l1_memory) diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index 023826e5cd9..7e760b3bf37 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -377,38 +377,45 @@ void verify_safe_allocation(Allocator& allocator) { } } -uint64_t allocate_buffer( - Allocator &allocator, - DeviceAddr size, - DeviceAddr page_size, - const BufferType &buffer_type, - bool bottom_up, - std::optional num_shards) { - uint64_t address = 0; +const std::unordered_set &get_allocated_buffers(const Allocator &allocator) { return allocator.allocated_buffers; } + +DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer) { + DeviceAddr address = 0; + auto page_size = buffer->page_size(); + auto buffer_type = buffer->buffer_type(); + auto bottom_up = buffer->bottom_up(); + auto num_shards = buffer->num_cores(); verify_safe_allocation(allocator); switch (buffer_type) { case BufferType::DRAM: - return allocator.descriptor.dram.alloc( + address = allocator.descriptor.dram.alloc( allocator.config, allocator.dram_manager, size, page_size, bottom_up, num_shards); + break; case BufferType::L1: - return allocator.descriptor.l1.alloc( + address = allocator.descriptor.l1.alloc( allocator.config, allocator.l1_manager, size, page_size, bottom_up, num_shards); + break; case BufferType::L1_SMALL: { TT_FATAL(num_shards.has_value(), "L1_SMALL only supports sharded allocations, see validate_num_banks"); - return allocator.descriptor.l1.alloc( + address = allocator.descriptor.l1.alloc( allocator.config, allocator.l1_small_manager, size, page_size, bottom_up, num_shards); - case BufferType::TRACE: - return allocator.descriptor.dram.alloc( - allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards); + break; } + case BufferType::TRACE: + address = 
allocator.descriptor.dram.alloc( + allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards); + break; default: { TT_THROW("Unsupported buffer type!"); } } + allocator.allocated_buffers.insert(buffer); return address; } -void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type) { +void deallocate_buffer(Allocator &allocator, Buffer *buffer) { + auto address = buffer->address(); + auto buffer_type = buffer->buffer_type(); switch (buffer_type) { case BufferType::DRAM: allocator.dram_manager.deallocate_buffer(address); break; case BufferType::L1: allocator.l1_manager.deallocate_buffer(address); break; @@ -418,6 +425,7 @@ void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferTyp TT_THROW("Unsupported buffer type!"); } } + allocator.allocated_buffers.erase(buffer); } void deallocate_buffers(Allocator &allocator) { @@ -425,6 +433,7 @@ void deallocate_buffers(Allocator &allocator) { allocator.l1_manager.deallocate_all(); allocator.l1_small_manager.deallocate_all(); allocator.trace_buffer_manager.deallocate_all(); + allocator.allocated_buffers.clear(); } void clear(Allocator &allocator) { @@ -432,6 +441,7 @@ void clear(Allocator &allocator) { allocator.l1_manager.clear(); allocator.l1_small_manager.clear(); allocator.trace_buffer_manager.clear(); + allocator.allocated_buffers.clear(); } } // namespace allocator @@ -460,6 +470,7 @@ void Allocator::reset() { l1_manager.clear(); l1_small_manager.clear(); trace_buffer_manager.clear(); + allocated_buffers.clear(); config.reset(); } diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp index ecb31dfb5c8..60e4c97f0b9 100644 --- a/tt_metal/impl/allocator/allocator.hpp +++ b/tt_metal/impl/allocator/allocator.hpp @@ -19,6 +19,12 @@ namespace tt { namespace tt_metal { +inline namespace v0 { + +class Buffer; + +} // namespace v0 + // Fwd declares enum class BufferType; struct Allocator; @@ -99,15 +105,17 @@ 
std::optional lowest_occupied_l1_address(const Allocator &allocator, DeviceAddr base_alloc(const AllocatorConfig & config, BankManager &bank_manager, DeviceAddr size, DeviceAddr page_size, bool bottom_up, std::optional num_shards); -DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, DeviceAddr page_size, const BufferType &buffer_type, bool bottom_up, std::optional num_shards = std::nullopt); +DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer); void mark_allocations_unsafe(Allocator &allocator); void mark_allocations_safe(Allocator &allocator); -void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type); +void deallocate_buffer(Allocator &allocator, Buffer *buffer); void deallocate_buffers(Allocator &allocator); +const std::unordered_set &get_allocated_buffers(const Allocator &allocator); + void clear(Allocator &allocatator); } // namespace allocator @@ -127,6 +135,7 @@ struct Allocator { std::unordered_map> dram_channel_to_bank_ids; std::unordered_map bank_id_to_logical_core; std::unordered_map>> logical_core_to_bank_ids; + std::unordered_set allocated_buffers; AllocatorConfig config; // Callbacks to invoke during initialization and allocation diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 0403a82af98..d4cfcf88be3 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -44,9 +44,15 @@ void validate_buffer_size_and_page_size( "Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values"); if (is_sharded(buffer_layout)) { - TT_FATAL(shard_parameters != std::nullopt, "Sharded buffers must have a core grid assigned"); - } else if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) { - TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page"); + TT_FATAL( + shard_parameters != std::nullopt, + "Buffer was specified as sharded but does not have shard_parameters specified"); + } 
else { + TT_FATAL( + shard_parameters == std::nullopt, "Buffer was specified as not sharded but has shard_parameters specified"); + if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) { + TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page"); + } } } @@ -125,7 +131,7 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { auto shard_spec = buffer.shard_spec(); bool row_major = shard_spec.orientation() == ShardOrientation::ROW_MAJOR; - uint32_t num_cores = buffer.num_cores(); + uint32_t num_cores = buffer.num_cores().value(); buffer_page_mapping.all_cores_ = corerange_to_cores(shard_spec.grid(), num_cores, row_major); TT_FATAL(num_cores == buffer_page_mapping.all_cores_.size(), "Buffer has {} cores, but page mapping expects {} cores", num_cores, buffer_page_mapping.all_cores_.size()); @@ -196,7 +202,7 @@ Buffer::Buffer( buffer_type_(buffer_type), buffer_layout_(buffer_layout), shard_parameters_(shard_parameters), - bottom_up_(bottom_up), + bottom_up_(bottom_up.value_or(this->is_dram())), buffer_page_mapping_(nullptr) { TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null."); @@ -223,9 +229,7 @@ std::shared_ptr Buffer::create( } buffer->device_->push_work([buffer] { - bool bottom_up = buffer->bottom_up_.value_or(buffer->is_dram()); - buffer->address_ = detail::AllocateBuffer(buffer.get(), bottom_up); - detail::BUFFER_MAP.insert({buffer->device_->id(), buffer->address_}, buffer.get()); + buffer->address_ = detail::AllocateBuffer(buffer.get()); std::unique_lock lock(buffer->allocation_mutex_); buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed); @@ -257,7 +261,6 @@ void Buffer::deallocate_impl() { if (device_->initialized_ && size_ != 0) { // address_ is only modified from this thread, no sync required - detail::BUFFER_MAP.erase({device_->id(), address_}); detail::DeallocateBuffer(this); } @@ -306,7 +309,7 @@ uint32_t 
Buffer::num_dev_pages() const { return this->num_pages(); } - return this->shard_spec().size() * this->num_cores(); + return this->shard_spec().size() * this->num_cores().value(); } CoreType Buffer::core_type() const { @@ -399,9 +402,9 @@ void Buffer::set_shard_spec(const ShardSpecBuffer& shard_spec) { this->buffer_page_mapping_ = nullptr; } -uint32_t Buffer::num_cores() const { +std::optional Buffer::num_cores() const { if (!is_sharded(this->buffer_layout_)) - return 1; + return std::nullopt; return this->shard_spec().tensor_shard_spec.grid.num_cores(); } @@ -433,10 +436,6 @@ DeviceAddr ShardSpecBuffer::size() const { return shape_in_pages_[0] * shape_in_pages_[1]; } -namespace detail { -buffer_map_t BUFFER_MAP = {}; -} - } // namespace tt_metal } // namespace tt diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index 8c4332de0cb..77c69707853 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -13,7 +13,6 @@ #include "common/core_coord.hpp" #include "common/tt_backend_api_types.hpp" #include "hostdevcommon/common_values.hpp" -#include "tt_metal/common/base.hpp" #include "tt_metal/common/constants.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/impl/allocator/allocator_types.hpp" @@ -176,6 +175,8 @@ class Buffer final { TensorMemoryLayout buffer_layout() const { return buffer_layout_; } + bool bottom_up() const { return bottom_up_; } + uint32_t dram_channel_from_bank_id(uint32_t bank_id) const; CoreCoord logical_core_from_bank_id(uint32_t bank_id) const; @@ -199,7 +200,7 @@ class Buffer final { ShardSpecBuffer shard_spec() const; void set_shard_spec(const ShardSpecBuffer& shard_spec); - uint32_t num_cores() const; + std::optional num_cores() const; const std::shared_ptr& get_buffer_page_mapping(); @@ -231,7 +232,7 @@ class Buffer final { const DeviceAddr size_; // Size in bytes const BufferType buffer_type_; const TensorMemoryLayout buffer_layout_; - const std::optional bottom_up_; + 
const bool bottom_up_; std::atomic allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED; DeviceAddr address_ = 0; @@ -252,36 +253,6 @@ class Buffer final { BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer); -namespace detail { -using Deviceid = uint32_t; - -class buffer_map_t { - public: - void insert(std::tuple buf_attr, Buffer *buffer) { - std::scoped_lock lock(this->map_mutex); - this->map.insert({buf_attr, buffer}); - } - - void erase(std::tuple buf_attr) { - std::scoped_lock lock(this->map_mutex); - this->map.erase(buf_attr); - } - - std::map, Buffer *> value() { - std::scoped_lock lock(this->map_mutex); - return this->map; - } - - ~buffer_map_t() { TT_ASSERT(this->map.empty(), "Not all buffers deallocated by runtime!"); } - - private: - std::mutex map_mutex; - std::map, Buffer *> map = {}; -}; - -extern buffer_map_t BUFFER_MAP; -} // namespace detail - inline namespace v0 { using HostDataType = std::variant< diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 880f866ae9e..268bc8ad3fb 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -30,6 +30,7 @@ using std::string; using std::to_string; using std::cout; using std::endl; +using std::ostream; using std::setw; using std::flush; using std::tuple; @@ -154,7 +155,7 @@ struct DebugPrintServerContext { // A map from Device -> Core Range, which is used to determine which cores on which devices // to scan for print data. Also a lock for editing it. - std::map> device_to_core_range_; + std::map> device_to_core_range_; std::map device_reads_dispatch_cores_; // True if given device reads any dispatch cores. Used to // know whether dprint can be compiled out. std::mutex device_to_core_range_lock_; @@ -326,7 +327,7 @@ void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, boo // TODO(AP): this could use a cleanup - need a different mechanism to know if a kernel is running on device. 
// Force wait for first kernel launch by first writing a non-zero and waiting for a zero. - vector initbuf = vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); + std::vector initbuf = std::vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); initbuf[0] = uint32_t(enabled ? DEBUG_PRINT_SERVER_STARTING_MAGIC : DEBUG_PRINT_SERVER_DISABLED_MAGIC); tt::llrt::write_hex_vec_to_core(device->id(), phys_core, initbuf, base_addr); } // WriteInitMagic @@ -339,7 +340,7 @@ bool CheckInitMagicCleared(Device *device, const CoreCoord& phys_core, int hart_ // compute the buffer address for the requested hart uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id); - vector initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC }; + std::vector initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC }; auto result = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, base_addr, 4); return (result[0] != initbuf[0]); } // CheckInitMagicCleared @@ -445,13 +446,13 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // If RTOptions doesn't enable DPRINT on this device, return here and don't actually attach it // to the server. - vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); + std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end()) return; // Core range depends on whether dprint_all_cores flag is set. 
- vector print_cores_sanitized; + std::vector print_cores_sanitized; for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) { if (tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassAll) { @@ -494,7 +495,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { tt::llrt::get_core_type_name(core_type)); } else { // No "all cores" option provided, which means print from the cores specified by the user - vector& print_cores = + std::vector& print_cores = tt::llrt::OptionsG.get_feature_cores(tt::llrt::RunTimeDebugFeatureDprint)[core_type]; // We should also validate that the cores the user specified are valid worker cores. @@ -555,7 +556,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { void DebugPrintServerContext::DetachDevice(Device* device) { // Don't detach the device if it's disabled by env vars - in this case it wasn't attached. - vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); + std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end()) return; @@ -925,7 +926,7 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking( // with rpos not aligned to wpos // write back to device - update rpos only - vector rposbuf; + std::vector rposbuf; rposbuf.push_back(rpos); uint32_t offs = DebugPrintMemLayout().rpos_offs(); tt::llrt::write_hex_vec_to_core(chip_id, phys_core, rposbuf, base_addr+offs); @@ -956,7 +957,7 @@ void DebugPrintServerContext::PollPrintData(uint32_t hart_mask) { } // Make a copy of the device->core map, so that it can be modified while polling. 
- std::map> device_to_core_range_copy; + std::map> device_to_core_range_copy; device_to_core_range_lock_.lock(); device_to_core_range_copy = device_to_core_range_; diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp index 2d07fa593fe..2ef251ae58d 100644 --- a/tt_metal/impl/debug/noc_logging.cpp +++ b/tt_metal/impl/debug/noc_logging.cpp @@ -97,7 +97,7 @@ void ClearNocData(Device *device) { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); - vector initbuf = vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); + std::vector initbuf = std::vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); tt::llrt::write_hex_vec_to_core(device->id(), phys_core, initbuf, addr); } } diff --git a/tt_metal/impl/debug/sanitize_noc_host.hpp b/tt_metal/impl/debug/sanitize_noc_host.hpp index 9dcd5c8a2d0..58029fe339d 100644 --- a/tt_metal/impl/debug/sanitize_noc_host.hpp +++ b/tt_metal/impl/debug/sanitize_noc_host.hpp @@ -24,7 +24,7 @@ namespace tt { #define DEBUG_VALID_ETH_ADDR(a, l) (((a) >= MEM_ETH_BASE) && ((a) + (l) <= MEM_ETH_BASE + MEM_ETH_SIZE)) -static bool coord_found_p(vectorcoords, CoreCoord core) { +static bool coord_found_p(std::vectorcoords, CoreCoord core) { for (CoreCoord item : coords) { if (item == core) return true; } diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index e5446268717..fdcf5638970 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -114,11 +114,11 @@ const launch_msg_t* get_valid_launch_message(const mailboxes_t *mbox_data) { namespace tt::watcher { WatcherDeviceReader::WatcherDeviceReader( - FILE *f, Device *device, vector &kernel_names, void (*set_watcher_exception_message)(const string &)) : + FILE *f, Device *device, std::vector &kernel_names, 
void (*set_watcher_exception_message)(const string &)) : f(f), device(device), kernel_names(kernel_names), set_watcher_exception_message(set_watcher_exception_message) { // On init, read out eth link retraining register so that we can see if retraining has occurred. WH only for now. if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) { - vector read_data; + std::vector read_data; for (const CoreCoord ð_core : device->get_active_ethernet_cores()) { CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core); read_data = tt::llrt::read_hex_vec_from_core( @@ -131,7 +131,7 @@ WatcherDeviceReader::WatcherDeviceReader( WatcherDeviceReader::~WatcherDeviceReader() { // On close, read out eth link retraining register so that we can see if retraining has occurred. if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) { - vector read_data; + std::vector read_data; for (const CoreCoord ð_core : device->get_active_ethernet_cores()) { CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core); read_data = tt::llrt::read_hex_vec_from_core( diff --git a/tt_metal/impl/debug/watcher_device_reader.hpp b/tt_metal/impl/debug/watcher_device_reader.hpp index 7f60ad5d4cf..0e0226eebbc 100644 --- a/tt_metal/impl/debug/watcher_device_reader.hpp +++ b/tt_metal/impl/debug/watcher_device_reader.hpp @@ -24,7 +24,7 @@ typedef struct { class WatcherDeviceReader { public: WatcherDeviceReader( - FILE *f, Device *device, vector &kernel_names, void (*set_watcher_exception_message)(const string &)); + FILE *f, Device *device, std::vector &kernel_names, void (*set_watcher_exception_message)(const std::string &)); ~WatcherDeviceReader(); void Dump(FILE *file = nullptr); @@ -32,9 +32,9 @@ class WatcherDeviceReader { // Functions for dumping each watcher feature to the log void DumpCore(CoreDescriptor &logical_core, bool is_active_eth_core); void DumpL1Status(CoreDescriptor &core, const launch_msg_t *launch_msg); - 
void DumpNocSanitizeStatus(CoreDescriptor &core, const string &core_str, const mailboxes_t *mbox_data, int noc); - void DumpAssertStatus(CoreDescriptor &core, const string &core_str, const mailboxes_t *mbox_data); - void DumpPauseStatus(CoreDescriptor &core, const string &core_str,const mailboxes_t *mbox_data); + void DumpNocSanitizeStatus(CoreDescriptor &core, const std::string &core_str, const mailboxes_t *mbox_data, int noc); + void DumpAssertStatus(CoreDescriptor &core, const std::string &core_str, const mailboxes_t *mbox_data); + void DumpPauseStatus(CoreDescriptor &core, const std::string &core_str,const mailboxes_t *mbox_data); void DumpRingBuffer(CoreDescriptor &core, const mailboxes_t *mbox_data, bool to_stdout); void DumpRunState(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t state); void DumpLaunchMessage(CoreDescriptor &core, const mailboxes_t *mbox_data); @@ -45,12 +45,12 @@ class WatcherDeviceReader { // Helper functions void LogRunningKernels(CoreDescriptor &core, const launch_msg_t *launch_msg); - string GetKernelName(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t type); + std::string GetKernelName(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t type); FILE *f; Device *device; - vector &kernel_names; - void (* set_watcher_exception_message)(const string &); + std::vector &kernel_names; + void (* set_watcher_exception_message)(const std::string &); // Information that needs to be kept around on a per-dump basis std::set> paused_cores; diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index a644e379cf8..06070107aaf 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -254,7 +254,7 @@ void watcher_init(Device *device) { for (tt::llrt::RunTimeDebugFeatures delay_feature = tt::llrt::RunTimeDebugFeatureReadDebugDelay; (int)delay_feature <= tt::llrt::RunTimeDebugFeatureAtomicDebugDelay; delay_feature = 
(tt::llrt::RunTimeDebugFeatures)((int)delay_feature + 1)) { - vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(delay_feature); + std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(delay_feature); bool this_chip_enabled = tt::llrt::OptionsG.get_feature_all_chips(delay_feature) || std::find(chip_ids.begin(), chip_ids.end(), device->id()) != chip_ids.end(); if (this_chip_enabled) { @@ -275,7 +275,7 @@ void watcher_init(Device *device) { } for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) { - vector delayed_cores = tt::llrt::OptionsG.get_feature_cores(delay_feature)[core_type]; + std::vector delayed_cores = tt::llrt::OptionsG.get_feature_cores(delay_feature)[core_type]; for (tt_xy_pair logical_core : delayed_cores) { CoreCoord phys_core; bool valid_logical_core = true; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 0479807018f..e536c9e940a 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -556,7 +556,7 @@ void Device::initialize_and_launch_firmware() { go_msg.signal = RUN_MSG_INIT; // Populate core info, which will be written to device - vector core_info_vec(sizeof(core_info_msg_t) / sizeof(uint32_t)); + std::vector core_info_vec(sizeof(core_info_msg_t) / sizeof(uint32_t)); core_info_msg_t *core_info = (core_info_msg_t *) core_info_vec.data(); const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(this->id()); @@ -593,7 +593,7 @@ void Device::initialize_and_launch_firmware() { // Determine which noc-coords are harvested // TODO(PGK/Almeet): fix this w/ new UMD - vector harvested_rows; + std::vector harvested_rows; uint32_t harvested_noc_rows = tt::Cluster::instance().get_harvested_rows(this->id()); for (uint32_t y = 0; y < soc_d.grid_size.y; y++) { bool row_harvested = (harvested_noc_rows >> y) & 0x1; @@ -2735,7 +2735,7 @@ void Device::configure_command_queue_programs() { uint32_t issue_queue_size = 
this->sysmem_manager_->get_issue_queue_size(cq_id); uint32_t completion_queue_start_addr = cq_start + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size); uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4; - vector completion_queue_wr_ptr = {completion_queue_start_addr_16B}; + std::vector completion_queue_wr_ptr = {completion_queue_start_addr_16B}; detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_rd_ptr, completion_queue_wr_ptr, dispatch_core_type); detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_wr_ptr, completion_queue_wr_ptr, dispatch_core_type); detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q0_last_event_ptr, zero, dispatch_core_type); @@ -2949,10 +2949,8 @@ bool Device::close() { tt::Cluster::instance().l1_barrier(id_); allocator::clear(*this->allocator_); // After device close, no buffers on this device should be used - for (const auto &[buf_attr, buf] : detail::BUFFER_MAP.value()) { - if (std::get<0>(buf_attr) == this->id()) { - DeallocateBuffer(*buf); - } + for (const auto &buf : this->get_allocated_buffers()) { + DeallocateBuffer(*buf); } this->compute_cores_.clear(); @@ -3174,6 +3172,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou return allocator::dump_memory_blocks(*this->allocator_, buffer_type, out); } +const std::unordered_set &Device::get_allocated_buffers() const { + this->check_allocator_is_initialized(); + return allocator::get_allocated_buffers(*this->allocator_); +} + void Device::deallocate_buffers(){ allocator::deallocate_buffers(*allocator_); } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 7beb58f3ea8..dce53a1eae8 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -197,6 +197,8 @@ class Device { uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const; uint32_t 
get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const; + const std::unordered_set &get_allocated_buffers() const; + void deallocate_buffers(); // machine epsilon diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 32c7ac99e73..e4ee5405f07 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -663,7 +663,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); } } else { - vector> dst_noc_multicast_info = + std::vector> dst_noc_multicast_info = device->extract_dst_noc_multicast_info>( kernel->logical_coreranges(), core_type); common_sub_cmds.emplace>( @@ -730,8 +730,9 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + const auto &program_transfer_info = program.get_program_transfer_info(); // Multicast Semaphore Cmd - uint32_t num_multicast_semaphores = program.program_transfer_info.multicast_semaphores.size(); + uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); std::vector>> multicast_sem_data(num_multicast_semaphores); std::vector>> multicast_sem_payload(num_multicast_semaphores); @@ -739,7 +740,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro multicast_sem_dst_size.reserve(num_multicast_semaphores); if (num_multicast_semaphores > 0) { uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) { + for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { // TODO: loop over things inside transfer_info[i] uint32_t write_packed_len = 
transfer_info_vec[0].data.size(); multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); @@ -748,7 +749,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro for (const auto& dst_noc_info : transfer_info.dst_noc_info) { TT_ASSERT( transfer_info.data.size() == write_packed_len, - "Not all data vectors in write packed semaphore cmd equal in len"); + "Not all data std::vectors in write packed semaphore cmd equal in len"); multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{ .noc_xy_addr = this->device->get_noc_multicast_encoding( this->noc_index, std::get(dst_noc_info.first)), @@ -768,7 +769,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } // Unicast Semaphore Cmd - uint32_t num_unicast_semaphores = program.program_transfer_info.unicast_semaphores.size(); + uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); std::vector>> unicast_sem_data(num_unicast_semaphores); std::vector>> unicast_sem_payload(num_unicast_semaphores); @@ -776,7 +777,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro unicast_sem_dst_size.reserve(num_unicast_semaphores); if (num_unicast_semaphores > 0) { uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.unicast_semaphores) { + for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { // TODO: loop over things inside transfer_info[i] uint32_t write_packed_len = transfer_info_vec[0].data.size(); unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); @@ -785,7 +786,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro for (const auto& dst_noc_info : transfer_info.dst_noc_info) { TT_ASSERT( transfer_info.data.size() == write_packed_len, - "Not all data vectors 
in write packed semaphore cmd equal in len"); + "Not all data std::vectors in write packed semaphore cmd equal in len"); unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{ .noc_xy_addr = this->device->get_noc_unicast_encoding( this->noc_index, std::get(dst_noc_info.first))}); @@ -876,7 +877,8 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2; const uint32_t max_paged_length_per_sub_cmd = max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program.program_transfer_info.kernel_bins) { + const auto &kernels_buffer = program.get_kernels_buffer(); + for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { bool write_linear; uint32_t noc_encoding; std::visit( @@ -913,14 +915,14 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro uint32_t base_address, page_offset; if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { - const uint32_t num_banks = this->device->num_banks(this->program.kernels_buffer->buffer_type()); + const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type()); page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; uint32_t num_full_pages_written_per_bank = kg_transfer_info.page_offsets[kernel_idx] / num_banks; - base_address = this->program.kernels_buffer->address() + - num_full_pages_written_per_bank * this->program.kernels_buffer->page_size(); + base_address = kernels_buffer->address() + + num_full_pages_written_per_bank * kernels_buffer->page_size(); } else { - base_address = this->program.kernels_buffer->address(); + base_address = kernels_buffer->address(); page_offset = kg_transfer_info.page_offsets[kernel_idx]; } @@ 
-928,11 +930,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro true, // is_dram page_offset, base_address, - this->program.kernels_buffer->page_size(), - relayed_bytes / this->program.kernels_buffer->page_size(), + kernels_buffer->page_size(), + relayed_bytes / kernels_buffer->page_size(), length_adjust); } else { - uint32_t base_address = this->program.kernels_buffer->address(); + uint32_t base_address = kernels_buffer->address(); uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; uint32_t dst_addr = kg_transfer_info.dst_base_addrs[kernel_idx]; uint32_t aligned_length = align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); @@ -1068,7 +1070,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active) // if not, check if the program is active on workers. 
If active, have dispatch_d issue a write barrier - cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program.program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; @@ -1251,11 +1253,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; if (this->device->dispatch_s_enabled()) { // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active - device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program.program_transfer_info.num_active_cores > 0); + device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0); dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; } else { // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores - if (program.program_transfer_info.num_active_cores > 0) { + if (program_transfer_info.num_active_cores > 0) { device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false); } } @@ -1463,7 +1465,7 @@ void EnqueueProgramCommand::process() { } const std::pair&> reservation = - this->manager.get_config_buffer_mgr().reserve(program.program_config_sizes_); + this->manager.get_config_buffer_mgr().reserve(program.get_program_config_sizes()); bool stall_first = reservation.first.need_sync; // Note: since present implementation always stalls, we always free up to "now" this->manager.get_config_buffer_mgr().free(reservation.first.sync_count); @@ -1484,9 +1486,10 @@ void EnqueueProgramCommand::process() { // Cache is only usable if caching is enabled and program is finalized 
// If cache has a program entry but the program is not finalized, then the cache is stale // Currently this is mapped by device, but will be mapped by multiple values in the future + auto &cached_program_command_sequences = program.get_cached_program_command_sequences(); uint64_t command_hash = this->device->id(); - auto cached_cmd_iter = this->program.cached_program_command_sequences_.find(command_hash); - bool is_cached = is_finalized && cached_cmd_iter != this->program.cached_program_command_sequences_.end(); + auto cached_cmd_iter = cached_program_command_sequences.find(command_hash); + bool is_cached = is_finalized && cached_cmd_iter != cached_program_command_sequences.end(); // Calculate all commands size and determine how many fetch q entries to use // Preamble, some waits and stalls @@ -1506,7 +1509,7 @@ void EnqueueProgramCommand::process() { this->assemble_device_commands(program_command_sequence, kernel_config_addrs); this->write_program_command_sequence(program_command_sequence, stall_first); this->assemble_stall_commands(program_command_sequence, false); - this->program.cached_program_command_sequences_.insert({command_hash, std::move(program_command_sequence)}); + cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)}); } else { static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); static constexpr uint32_t tensix_l1_write_offset_offset = @@ -2230,20 +2233,19 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); if (not program.is_finalized()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached"); - if (program.kernels_buffer != nullptr) { + if (const auto &kernels_buffer = program.get_kernels_buffer()) { this->enqueue_write_buffer( - *program.kernels_buffer, program.program_transfer_info.binary_data.data(), false); + *kernels_buffer, 
program.get_program_transfer_info().binary_data.data(), false); } } #ifdef DEBUG if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); - if (program.kernels_buffer != nullptr) { - const auto& buffer = program.kernels_buffer; + if (const auto &buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); - this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true); + this->enqueue_read_buffer(*buffer, read_data.data(), true); TT_FATAL( - program.program_transfer_info.binary_data == read_data, + program.get_program_transfer_info().binary_data == read_data, "Binary for program to be executed is corrupted. Another program likely corrupted this binary"); } } @@ -2293,12 +2295,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { #ifdef DEBUG if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); - if (program.kernels_buffer != nullptr) { - const auto& buffer = program.kernels_buffer; + if (const auto& buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); - this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true); + this->enqueue_read_buffer(*buffer, read_data.data(), true); TT_FATAL( - program.program_transfer_info.binary_data == read_data, + program.get_program_transfer_info().binary_data == read_data, "Binary for program that executed is corrupted. 
This program likely corrupted its own binary."); } } @@ -2307,7 +2308,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { log_trace( tt::LogMetal, "Created EnqueueProgramCommand (active_cores: {} bypass_mode: {} expected_workers_completed: {})", - program.program_transfer_info.num_active_cores, + program.get_program_transfer_info().num_active_cores, this->manager.get_bypass_mode(), expected_workers_completed); } @@ -2790,7 +2791,7 @@ inline namespace v0 { void EnqueueReadBuffer( CommandQueue& cq, std::variant, std::shared_ptr> buffer, - vector& dst, + std::vector& dst, bool blocking) { // TODO(agrebenisan): Move to deprecated ZoneScoped; @@ -2820,7 +2821,7 @@ void EnqueueReadBuffer( void EnqueueWriteBuffer( CommandQueue& cq, std::variant, std::shared_ptr> buffer, - vector& src, + std::vector& src, bool blocking) { // TODO(agrebenisan): Move to deprecated EnqueueWriteBuffer(cq, buffer, src.data(), blocking); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 64f6c5407b7..766346dfe52 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -14,7 +14,6 @@ #include #include "common/env_lib.hpp" -#include "tt_metal/common/base.hpp" #include "tt_metal/impl/dispatch/program_command_sequence.hpp" #include "tt_metal/impl/dispatch/command_queue_interface.hpp" #include "tt_metal/impl/dispatch/device_command.hpp" @@ -478,9 +477,6 @@ using CompletionReaderQueue = LockFreeQueue; struct AllocBufferMetadata { Buffer* buffer; std::reference_wrapper allocator; - BufferType buffer_type; - uint32_t device_address; - bool bottom_up; }; struct RuntimeArgsMetadata { diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index e1a8f3f0b0f..bf8ac017030 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -432,21 +432,21 @@ class 
SystemMemoryManager { chip_id_t device_id; uint8_t num_hw_cqs; const std::function fast_write_callable; - vector completion_byte_addrs; + std::vector completion_byte_addrs; char *cq_sysmem_start; - vector cq_interfaces; + std::vector cq_interfaces; uint32_t cq_size; uint32_t channel_offset; - vector cq_to_event; - vector cq_to_last_completed_event; - vector cq_to_event_locks; - vector prefetcher_cores; - vector prefetch_q_writers; - vector prefetch_q_dev_ptrs; - vector prefetch_q_dev_fences; + std::vector cq_to_event; + std::vector cq_to_last_completed_event; + std::vector cq_to_event_locks; + std::vector prefetcher_cores; + std::vector prefetch_q_writers; + std::vector prefetch_q_dev_ptrs; + std::vector prefetch_q_dev_fences; bool bypass_enable; - vector bypass_buffer; + std::vector bypass_buffer; uint32_t bypass_buffer_write_offset; WorkerConfigBufferMgr config_buffer_mgr; @@ -528,7 +528,7 @@ class SystemMemoryManager { prefetch_q_base + dispatch_constants::get(core_type, num_hw_cqs).prefetch_q_entries() * sizeof(dispatch_constants::prefetch_q_entry_type); } - vector temp_mutexes(num_hw_cqs); + std::vector temp_mutexes(num_hw_cqs); cq_to_event_locks.swap(temp_mutexes); for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { @@ -636,7 +636,7 @@ class SystemMemoryManager { chip_id_t get_device_id() const { return this->device_id; } - vector& get_cq_interfaces() { return this->cq_interfaces; } + std::vector& get_cq_interfaces() { return this->cq_interfaces; } void *issue_queue_reserve(uint32_t cmd_size_B, const uint8_t cq_id) { if (this->bypass_enable) { diff --git a/tt_metal/impl/dispatch/data_collection.cpp b/tt_metal/impl/dispatch/data_collection.cpp index c5c400e01c3..bf18bb2d6f5 100644 --- a/tt_metal/impl/dispatch/data_collection.cpp +++ b/tt_metal/impl/dispatch/data_collection.cpp @@ -35,7 +35,7 @@ class DispatchStats { Update(other.max_transaction_size, other.min_transaction_size, other.num_writes, other.total_write_size); } - void 
Dump(std::ofstream &outfile, map &raw_data) { + void Dump(std::ofstream &outfile, std::map &raw_data) { outfile << fmt::format("\t\tmax_transaction_size = {}\n", max_transaction_size); outfile << fmt::format("\t\tmin_transaction_size = {}\n", min_transaction_size); outfile << fmt::format("\t\tnum_writes = {}\n", num_writes); @@ -74,7 +74,7 @@ class DispatchData { // Track stats for all RISCS, as well as per RISC DispatchStats total_stats; - map total_data; + std::map total_data; for (auto &riscv_and_data : data) { // Go through all data and update stats DispatchStats riscv_stats; @@ -98,7 +98,7 @@ class DispatchData { } private: - map> data; // RISCV -> transaction size -> count + std::map> data; // RISCV -> transaction size -> count data_collector_t type; }; @@ -117,21 +117,21 @@ class DataCollector { }; void RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv); - void RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups); + void RecordKernelGroups(Program &program, CoreType core_type, std::vector &kernel_groups); void RecordProgramRun(Program &program); void DumpData(); private: - map> program_id_to_dispatch_data; - map>>> program_id_to_kernel_groups; - map program_id_to_call_count; + std::map> program_id_to_dispatch_data; + std::map>>> program_id_to_kernel_groups; + std::map program_id_to_call_count; }; void DataCollector::RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv) { uint64_t program_id = program.get_id(); if (program_id_to_dispatch_data.count(program_id) == 0) { // If no existing data for this program, initialize starting values. 
- program_id_to_dispatch_data[program_id] = vector(); + program_id_to_dispatch_data[program_id] = std::vector(); for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) { data_collector_t curr_type = static_cast(idx); DispatchData data(curr_type); @@ -142,7 +142,7 @@ void DataCollector::RecordData(Program &program, data_collector_t type, uint32_t program_id_to_dispatch_data[program_id].at(type).Update(transaction_size, riscv); } -void DataCollector::RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups) { +void DataCollector::RecordKernelGroups(Program &program, CoreType core_type, std::vector &kernel_groups) { uint64_t program_id = program.get_id(); // Make a copy of relevant info, since user may destroy program before we dump. for (KernelGroup &kernel_group : kernel_groups) { @@ -189,7 +189,7 @@ void DataCollector::DumpData() { std::ofstream outfile = std::ofstream("dispatch_data.txt"); // Extra DispatchData objects to collect data across programs - vector cross_program_data; + std::vector cross_program_data; for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) { cross_program_data.push_back(new DispatchData(idx)); } @@ -202,7 +202,7 @@ void DataCollector::DumpData() { // Dump kernel ids for each kernel group in this program for (auto &core_type_and_kernel_groups : program_id_to_kernel_groups[program_id]) { CoreType core_type = core_type_and_kernel_groups.first; - vector> &kernel_groups = core_type_and_kernel_groups.second; + std::vector> &kernel_groups = core_type_and_kernel_groups.second; outfile << fmt::format("\t{} Kernel Groups: {}\n", core_type, kernel_groups.size()); for (auto &ids_and_ranges : kernel_groups) { // Dump kernel ids in this group @@ -266,7 +266,7 @@ void RecordDispatchData(Program &program, data_collector_t type, uint32_t transa DataCollector::inst->RecordData(program, type, transaction_size, riscv); } -void RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups) { +void RecordKernelGroups(Program 
&program, CoreType core_type, std::vector &kernel_groups) { // Do nothing if we're not enabling data collection. if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) return; diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index f8c54fa3573..ea5141443b6 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -23,7 +23,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con host_dispatch_dump_file.open(host_file); device_dispatch_dump_file.open(device_file); - vector>> host_map; + std::vector>> host_map; string line; @@ -36,7 +36,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con } else if (line.find("BINARY SPAN") != string::npos or line.find("SEM") != string::npos or line.find("CB") != string::npos) { type = line; } else { - vector host_data = {line}; + std::vector host_data = {line}; while (std::getline(host_dispatch_dump_file, line) and (line.find("*") == string::npos)) { host_data.push_back(line); } @@ -44,8 +44,8 @@ void match_device_program_data_with_host_program_data(const char* host_file, con } } - vector> device_map; - vector device_data; + std::vector> device_map; + std::vector device_data; while (std::getline(device_dispatch_dump_file, line) and line != "EXIT_CONDITION") { if (line == "CHUNK") { if (not device_data.empty()) { @@ -63,7 +63,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con for (const auto& [type, host_data] : host_map) { bool match = false; - for (const vector& device_data : device_map) { + for (const std::vector& device_data : device_map) { if (host_data == device_data) { tt::log_info("Matched on {}", type); match = true; @@ -292,7 +292,7 @@ void dump_completion_queue_entries( uint32_t base_addr = (cq_interface.issue_fifo_limit << 4); // Read out in pages, this is fine since all completion Q entries are page aligned. 
- vector read_data; + std::vector read_data; read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE); tt::log_info("Reading Device {} CQ {}, Completion Queue...", sysmem_manager.get_device_id(), cq_interface.id); cq_file << fmt::format( @@ -382,7 +382,7 @@ void dump_issue_queue_entries( uint32_t issue_q_base_addr = cq_interface.offset + cq_interface.cq_start; // Read out in 4K pages, could do ISSUE_Q_ALIGNMENT chunks to match the entries but this is ~2x faster. - vector read_data; + std::vector read_data; read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE); tt::log_info("Reading Device {} CQ {}, Issue Queue...", sysmem_manager.get_device_id(), cq_interface.id); iq_file << fmt::format( @@ -542,7 +542,7 @@ void dump_command_queue_raw_data( } // Read out in pages - vector read_data; + std::vector read_data; read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE); out_file << std::endl; out_file << fmt::format( diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 4cece79d0c4..69fcdb4af45 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -78,8 +78,173 @@ size_t KernelCompileHash(const std::shared_ptr kernel, JitBuildOptions & } // namespace namespace detail { +class Program_ { + public: + Program_(); + + Program_(const Program_ &other) = delete; + Program_& operator=(const Program_ &other) = delete; + + Program_(Program_ &&other) = default; + Program_& operator=(Program_ &&other) = default; + + void set_runtime_id(uint64_t id); + ~Program_() noexcept = default; + + uint64_t get_id() const; + uint64_t get_runtime_id() const; + + size_t num_kernels() const; + + const std::vector> &circular_buffers() const; + + const std::vector< Semaphore > & semaphores() const; + + KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); + void add_buffer(std::shared_ptr buf); + void 
release_buffers(); + std::vector> circular_buffers_on_core(const CoreCoord &core) const; + + std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; + + std::vector circular_buffers_unique_coreranges() const; + + std::vector> semaphores_on_core(const CoreCoord &core) const; + + size_t num_semaphores () const; + void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const; + // XXXXX TODO: this should return a const reference + std::vector> logical_cores() const; + + void compile(Device * device, bool fd_bootloader_mode = false); + + void invalidate_circular_buffer_allocation(); + + void allocate_circular_buffers(const Device *device); + + bool is_finalized() const; + void finalize(Device *device); + std::shared_ptr get_kernel(KernelHandle kernel_id) const; + + ProgramConfig& get_program_config(uint32_t programmable_core_type_index); + + // debug/test + uint32_t get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; + + private: + void populate_dispatch_data(Device *device); + + // Buffers temporarily owned by the program + std::vector> owned_buffer_pool = {}; + + // The buffer that holds the kernel/binaries/etc for this program + std::shared_ptr kernels_buffer = nullptr; + ProgramTransferInfo program_transfer_info; + + bool finalized_; + + struct CircularBufferAllocator { + CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {} + + // Circular buffers are created and allocated at core range granularity + CoreRange core_range; + + // Holds vector of addresses where circular buffers are allocated [start, end) + // There are multiple ranges because per core L1 
regions are not in lockstep but circular buffers spanning multiple cores must share the same address + // To enable this, circular buffer address is the maximum address amongst all of its target cores + // This vector is sorted from lower to higher address spaces + std::vector> l1_regions; + + // Returns address for next circular buffer + // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region + uint64_t get_cb_region_end() const { + return this->l1_regions.empty() ? 0 : this->l1_regions.back().second; + } + + // If address is the end of the last L1 region, the last region is extended by size bytes, + // otherwise address must be higher than existing regions and a new L1 region [address, size) is added + void mark_address(uint64_t address, uint64_t size, uint64_t base_address); + + // Reset when circular buffer allocation is invalidated + void reset_available_addresses() { this->l1_regions.clear(); } + }; + + uint64_t id; // Need to make non-const due to move constructor + uint64_t runtime_id; + static std::atomic program_counter; + std::vector >> kernels_; + std::vector grid_extent_; + + std::vector> circular_buffers_; + std::unordered_map> circular_buffer_by_id_; + // Tracks which circular buffer indices are being used + std::unordered_map> per_core_cb_indices_; + // Used to generate circular buffer addresses. 
There is one CircularBufferAllocator per unique CoreRange + std::vector cb_allocators_; + + std::vector semaphores_; + + std::unordered_set compiled_; + bool local_circular_buffer_allocation_needed_; + + static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; + std::vector> kernel_groups_; + std::vector> core_to_kernel_group_index_table_; + uint32_t tensix_go_signal_count_; + + std::vector> config_buffers_; + + std::vector program_configs_; + std::vector program_config_sizes_; + + std::unordered_map cached_program_command_sequences_; + + friend std::shared_ptr GetCircularBuffer(const Program &program, CBHandle id); + friend void ValidateCircularBufferRegion(const Program &program, const Device *device); + + friend KernelHandle AddKernel(Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type); + + KernelHandle add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &core_type); + + CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config); + std::shared_ptr get_circular_buffer(CBHandle cb_id) const; + + void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); + + friend void AddConfigBuffer(Program &program, std::shared_ptr config_buffer); + void add_config_buffer(std::shared_ptr config_buffer); + + // Ensures that statically allocated circular buffers do not grow into L1 buffer space + void validate_circular_buffer_region(const Device *device) const; + + void set_cb_data_fmt( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; + + void set_cb_tile_dims( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; + + void update_kernel_groups(uint32_t programmable_core_type_index); + + uint32_t& get_program_config_size(uint32_t programmable_core_type_index); + + uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t 
finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset); + void set_launch_msg_sem_offsets(); + + bool runs_on_noc_unicast_only_cores(); + bool runs_on_noc_multicast_only_cores(); + + friend HWCommandQueue; + friend EnqueueProgramCommand; + friend Program; +}; + KernelHandle AddKernel (Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type) { - return program.add_kernel(kernel, core_type); + return program.pimpl_->add_kernel(std::move(kernel), core_type); } std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id) { @@ -87,16 +252,16 @@ std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id } std::shared_ptr GetCircularBuffer(const Program &program, CBHandle id) { - return program.get_circular_buffer(id); + return program.pimpl_->get_circular_buffer(id); } // Checks that circular buffers do not grow into L1 buffer space void ValidateCircularBufferRegion(const Program &program, const Device *device) { - program.validate_circular_buffer_region(device); + program.pimpl_->validate_circular_buffer_region(device); } void AddConfigBuffer(Program &program, std::shared_ptr config_buffer) { - program.add_config_buffer(config_buffer); + program.pimpl_->add_config_buffer(std::move(config_buffer)); } void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } @@ -104,12 +269,11 @@ void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } } // namespace detail -std::atomic Program::program_counter = 0; +std::atomic detail::Program_::program_counter = 0; -Program::Program() : +detail::Program_::Program_() : id(program_counter++), runtime_id(0), - worker_crs_(), 
local_circular_buffer_allocation_needed_(false), finalized_(false) { uint32_t programmable_core_count = hal.get_programmable_core_type_count(); @@ -124,7 +288,9 @@ Program::Program() : program_config_sizes_.resize(programmable_core_count); } -KernelHandle Program::add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &programmable_core_type) { +Program::Program() : pimpl_(std::make_unique()) {} + +KernelHandle detail::Program_::add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &programmable_core_type) { TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id); // Id is unique across all kernels on all core types KernelHandle id = this->num_kernels(); @@ -135,7 +301,7 @@ KernelHandle Program::add_kernel(std::shared_ptr kernel, const HalProgra return id; } -std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { +std::shared_ptr detail::Program_::get_kernel(KernelHandle kernel_id) const { // TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id, // this->id); // find coretype based on kernel_id @@ -149,10 +315,12 @@ std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { return nullptr; } +std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { return pimpl_->get_kernel(kernel_id); } + KernelGroup::KernelGroup() : core_ranges(CoreRangeSet()) {} KernelGroup::KernelGroup( - const Program &program, + const detail::Program_ &program, uint32_t programmable_core_type_index, kernel_id_array_t kernel_ids, bool erisc_is_idle, @@ -219,12 +387,16 @@ CoreType KernelGroup::get_core_type() const { return hal.get_core_type(this->programmable_core_type_index); }; -std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); return 
kernel_groups_[programmable_core_type_index]; } -KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { +std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { + return pimpl_->get_kernel_groups(programmable_core_type_index); +} + +KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y) return nullptr; @@ -232,6 +404,10 @@ KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmab return (index == core_to_kernel_group_invalid_index) ? nullptr : &kernel_groups_[programmable_core_type_index].at(index); } +KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { + return pimpl_->kernels_on_core(core, programmable_core_type_index); +} + struct KernelGroupInt { bool valid; kernel_id_array_t kernel_ids; @@ -262,7 +438,7 @@ struct KernelGroupIntHasher { } }; -void Program::update_kernel_groups(uint32_t programmable_core_type_index) { +void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_index) { if (core_to_kernel_group_index_table_[programmable_core_type_index].size() == 0) { bool erisc_is_idle = false; @@ -351,7 +527,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) { } } -void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) { +void detail::Program_::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) { if (this->l1_regions.empty()) { this->l1_regions.emplace_back(base_address, base_address); } @@ -370,7 +546,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s } } -CBHandle Program::add_circular_buffer(const CoreRangeSet 
&core_range_set, const CircularBufferConfig &config) { +CBHandle detail::Program_::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id); std::shared_ptr circular_buffer = std::make_shared(core_range_set, config); // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory @@ -421,14 +597,18 @@ CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const return circular_buffer->id(); } -std::shared_ptr Program::get_circular_buffer(CBHandle cb_id) const { +CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { + return pimpl_->add_circular_buffer(core_range_set, config); +} + +std::shared_ptr detail::Program_::get_circular_buffer(CBHandle cb_id) const { if (this->circular_buffer_by_id_.find(cb_id) == this->circular_buffer_by_id_.end()) { TT_THROW("No circular buffer with id {} exists in Program {}", cb_id, this->id); } return this->circular_buffer_by_id_.at(cb_id); } -const std::vector> Program::circular_buffers_on_core(const CoreCoord &core) const { +std::vector> detail::Program_::circular_buffers_on_core(const CoreCoord &core) const { std::vector> cbs_on_core; for (auto circular_buffer : circular_buffers_) { if (circular_buffer->is_on_logical_core(core)) { @@ -438,7 +618,11 @@ const std::vector> Program::circular_buffers_on_ return cbs_on_core; } -const std::vector> Program::circular_buffers_on_corerange(const CoreRange &cr) const { +std::vector> Program::circular_buffers_on_core(const CoreCoord &core) const { + return pimpl_->circular_buffers_on_core(core); +} + +std::vector> detail::Program_::circular_buffers_on_corerange(const CoreRange &cr) const { std::vector> cbs_on_core; for (auto circular_buffer : circular_buffers_) { if (circular_buffer->is_on_logical_corerange(cr)) { @@ -448,7 
+632,11 @@ const std::vector> Program::circular_buffers_on_ return cbs_on_core; } -const std::vector Program::circular_buffers_unique_coreranges() const { +std::vector> Program::circular_buffers_on_corerange(const CoreRange &cr) const { + return pimpl_->circular_buffers_on_corerange(cr); +} + +std::vector detail::Program_::circular_buffers_unique_coreranges() const { std::vector core_ranges; for (auto circular_buffer : circular_buffers_) { for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) { @@ -460,7 +648,11 @@ const std::vector Program::circular_buffers_unique_coreranges() const return core_ranges; } -void Program::invalidate_circular_buffer_allocation() { +std::vector Program::circular_buffers_unique_coreranges() const { + return pimpl_->circular_buffers_unique_coreranges(); +} + +void detail::Program_::invalidate_circular_buffer_allocation() { if (this->local_circular_buffer_allocation_needed_) { return; } @@ -470,7 +662,9 @@ void Program::invalidate_circular_buffer_allocation() { this->local_circular_buffer_allocation_needed_ = true; } -void Program::allocate_circular_buffers(const Device *device) { +void Program::invalidate_circular_buffer_allocation() { pimpl_->invalidate_circular_buffer_allocation(); } + +void detail::Program_::allocate_circular_buffers(const Device *device) { ZoneScoped; if (not this->local_circular_buffer_allocation_needed_) { return; @@ -512,7 +706,9 @@ void Program::allocate_circular_buffers(const Device *device) { this->local_circular_buffer_allocation_needed_ = false; } -void Program::validate_circular_buffer_region(const Device *device) const { +void Program::allocate_circular_buffers(const Device *device) { pimpl_->allocate_circular_buffers(device); } + +void detail::Program_::validate_circular_buffer_region(const Device *device) const { ZoneScoped; // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core @@ -549,9 +745,11 @@ void 
Program::validate_circular_buffer_region(const Device *device) const { size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); } -size_t Program::num_semaphores() const { return semaphores_.size(); } +size_t detail::Program_::num_semaphores() const { return semaphores_.size(); } -void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { +size_t Program::num_semaphores() const { return pimpl_->num_semaphores(); } + +void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { auto semaphores_on_core = this->semaphores_on_core(logical_core); uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG); @@ -566,14 +764,22 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor } } -void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { +void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { + pimpl_->init_semaphores(device, logical_core, programmable_core_type_index); +} + +void detail::Program_::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id); semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type)); } -void Program::add_config_buffer(std::shared_ptr config_buffer) { config_buffers_.emplace_back(config_buffer); } +void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { + pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type); +} -std::vector> Program::logical_cores() const { +void 
detail::Program_::add_config_buffer(std::shared_ptr config_buffer) { config_buffers_.emplace_back(config_buffer); } + +std::vector> detail::Program_::logical_cores() const { std::vector> cores_in_program; std::vector> unique_cores; for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < kernels_.size(); programmable_core_type_index++) { @@ -593,17 +799,9 @@ std::vector> Program::logical_cores() const { return cores_in_program; } -void Program::construct_core_range_set_for_worker_cores() { - bool found_kernels = false; - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (auto [id, kernel] : kernels_[index]) { - this->worker_crs_ = this->worker_crs_.merge(kernel->core_range_set()); - found_kernels = true; - } - TT_ASSERT(!found_kernels || this->worker_crs_.ranges().size() >= 1, "Invalid core range set"); -} +std::vector> Program::logical_cores() const { return pimpl_->logical_cores(); } -void Program::set_cb_data_fmt(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { +void detail::Program_::set_cb_data_fmt(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { ZoneScoped; for (auto logical_cr : crs) { auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr); @@ -616,7 +814,7 @@ void Program::set_cb_data_fmt(Device *device, const std::vector &crs, } } -void Program::set_cb_tile_dims(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { +void detail::Program_::set_cb_tile_dims(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { ZoneScoped; for (const auto &logical_cr : crs) { auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr); @@ -647,7 +845,7 @@ void Program::set_cb_tile_dims(Device *device, const std::vector &crs } } -void Program::populate_dispatch_data(Device *device) { +void detail::Program_::populate_dispatch_data(Device *device) { static const uint32_t 
processor_to_firmware_base[] = { MEM_BRISC_FIRMWARE_BASE, MEM_NCRISC_FIRMWARE_BASE, @@ -668,7 +866,7 @@ void Program::populate_dispatch_data(Device *device) { auto extract_dst_noc_unicast_info = [&device](const auto &ranges, const CoreType core_type) -> std::vector> { // This API extracts all the pairs of noc multicast encodings given a set of core ranges - vector> dst_noc_unicast_info; + std::vector> dst_noc_unicast_info; for (const CoreRange &core_range : ranges) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { @@ -682,13 +880,13 @@ void Program::populate_dispatch_data(Device *device) { // Unicast/Multicast Semaphores for (const Semaphore &semaphore : this->semaphores()) { - vector semaphore_data(1); + std::vector semaphore_data(1); semaphore_data[0] = semaphore.initial_value(); // TODO: use semaphore.core_type from main if (semaphore.core_type() == CoreType::WORKER) { uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - vector> dst_noc_multicast_info = + std::vector> dst_noc_multicast_info = device->extract_dst_noc_multicast_info>( semaphore.core_range_set().ranges(), CoreType::WORKER); transfer_info transfer_info = { @@ -700,7 +898,7 @@ void Program::populate_dispatch_data(Device *device) { } else if (semaphore.core_type() == CoreType::ETH) { // TODO: we only fast dispatch to active eth... 
uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - vector> dst_noc_unicast_info = + std::vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(semaphore.core_range_set().ranges(), CoreType::ETH); transfer_info transfer_info = { .dst_base_addr = semaphore.offset(), @@ -752,7 +950,7 @@ void Program::populate_dispatch_data(Device *device) { uint32_t max_kernel_bin_size = processor_to_firmware_size[sub_kernels[sub_kernel_index]]; - kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { + kernel_bin.process_spans([&](std::vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { max_kernel_bin_size -= dst - processor_to_firmware_base[sub_kernels[sub_kernel_index]]; @@ -802,7 +1000,7 @@ void Program::populate_dispatch_data(Device *device) { device->extract_dst_noc_multicast_info>( kernel_group.core_ranges.ranges(), core_type); - vector kernel_ids; + std::vector kernel_ids; for (auto &optional_id : kernel_group.kernel_ids) { if (optional_id) { kernel_ids.push_back(optional_id.value()); @@ -817,10 +1015,10 @@ void Program::populate_dispatch_data(Device *device) { } } else { TT_ASSERT(core_type == CoreType::ETH); - vector> dst_noc_unicast_info = + std::vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), core_type); - vector kernel_ids; + std::vector kernel_ids; if (kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0]) { kernel_ids.push_back(kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0].value()); } @@ -841,15 +1039,15 @@ void Program::populate_dispatch_data(Device *device) { return; } -uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { // Iterate over kernels in the program and "level" the number of RTAs based on the max // Unique RTAs are packed across dispatch classes // Common RTAs come after 
unique RTAs uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); - vector max_rtas(processor_classes); - vector max_crtas(processor_classes); + std::vector max_rtas(processor_classes); + std::vector max_crtas(processor_classes); uint32_t max_unique_rta_size = 0; uint32_t total_crta_size = 0; @@ -864,7 +1062,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 max_rtas[dispatch_class] = 0; auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { - auto kernel = detail::GetKernel(*this, optional_id.value()); + auto kernel = get_kernel(optional_id.value()); for (const CoreRange &core_range : kg.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { @@ -882,7 +1080,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 auto& optional_id = kg.kernel_ids[dispatch_class]; kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); if (optional_id) { - auto kernel = detail::GetKernel(*this, optional_id.value()); + auto kernel = get_kernel(optional_id.value()); kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]); kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset; offset += max_rtas[dispatch_class] * sizeof(uint32_t); @@ -901,7 +1099,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 } // Find the max # common RTAs across all kernels for each dispatch class for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(*this, kernel_id); + auto kernel = get_kernel(kernel_id); // TODO: kernels should be stored by programmable core type if (core_type == kernel->get_kernel_core_type() && (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { @@ -924,7 
+1122,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 // Set the runtime_args_data sizing info based on the shared max for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(*this, kernel_id); + auto kernel = get_kernel(kernel_id); // TODO: as above, fix when kernels are stored by programmable core type if (core_type == kernel->get_kernel_core_type() && (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { @@ -947,11 +1145,15 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 return max_unique_rta_size + total_crta_size; } -ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) { +ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) { return this->program_configs_[programmable_core_type_index]; } -uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { +ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) { + return pimpl_->get_program_config(programmable_core_type_index); +} + +uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { int max_id = -1; CoreType core_type = hal.get_core_type(programmable_core_type_index); @@ -969,7 +1171,7 @@ uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t return base_offset + sem_size; } -void Program::set_launch_msg_sem_offsets() { +void detail::Program_::set_launch_msg_sem_offsets() { for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) { for (auto& kg : this->get_kernel_groups(kg_type_index)) { @@ -981,7 +1183,7 @@ void Program::set_launch_msg_sem_offsets() { } } -uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_cbs(uint32_t 
programmable_core_type_index, uint32_t base_offset) { int count = 0; @@ -1003,7 +1205,7 @@ uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t b return base_offset + cb_size; } -uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); @@ -1062,11 +1264,11 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor return max_offset; } -uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index) { +uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) { return this->program_config_sizes_[programmable_core_type_index]; } -void Program::finalize(Device *device) { +void detail::Program_::finalize(Device *device) { // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) // TODO: should store all the counts @@ -1105,7 +1307,9 @@ void Program::finalize(Device *device) { finalized_ = true; } -void Program::compile(Device *device, bool fd_bootloader_mode) { +void Program::finalize(Device *device) { pimpl_->finalize(device); } + +void detail::Program_::compile(Device *device, bool fd_bootloader_mode) { ZoneScoped; if (compiled_.contains(device->id())) { return; @@ -1185,7 +1389,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { } if (detail::CompilationReporter::enabled()) { detail::CompilationReporter::inst().add_kernel_compile_stats( - *this, kernel, cache_hit, kernel_hash); + get_id(), kernel, cache_hit, kernel_hash); } kernel->set_binary_path(build_options.path); }, @@ -1202,23 +1406,28 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { sync_build_step(events); - 
this->construct_core_range_set_for_worker_cores(); if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) { this->populate_dispatch_data(device); // TODO: maybe rename } if (detail::CompilationReporter::enabled()) { - detail::CompilationReporter::inst().flush_program_entry(*this, enable_persistent_kernel_cache); + detail::CompilationReporter::inst().flush_program_entry(get_id(), num_kernels(), [this](size_t kernel_id) { + return get_kernel(kernel_id); + }, enable_persistent_kernel_cache); } if (detail::MemoryReporter::enabled()) { - detail::MemoryReporter::inst().flush_program_memory_usage(*this, device); + detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device); } compiled_.insert(device->id()); } -void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; } +void Program::compile(Device *device, bool fd_bootloader_mode) { pimpl_->compile(device, fd_bootloader_mode); } -uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { +void detail::Program_::set_runtime_id(uint64_t id) { this->runtime_id = id; } + +void Program::set_runtime_id(uint64_t id) { pimpl_->set_runtime_id(id); } + +uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1231,7 +1440,11 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core return base_addr + this->program_configs_[index].sem_offset; } -uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_sem_base_addr(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord 
logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1244,7 +1457,11 @@ uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreT return base_addr + this->program_configs_[index].cb_offset; } -uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_cb_base_addr(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1253,7 +1470,11 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType return this->program_configs_[index].sem_size; } -uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_sem_size(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1262,17 +1483,92 @@ uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType c return this->program_configs_[index].cb_size; } +uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_cb_size(device, logical_core, core_type); +} + // 
TODO: Too low level for program.cpp. Move this to HAL, once we have support. -bool Program::runs_on_noc_unicast_only_cores() { +bool detail::Program_::runs_on_noc_unicast_only_cores() { return (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1 and - this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).size()); + not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).empty()); } +bool Program::runs_on_noc_unicast_only_cores() { return pimpl_->runs_on_noc_unicast_only_cores(); } + // TODO: Too low level for program.cpp. Move this to HAL, once we have support. -bool Program::runs_on_noc_multicast_only_cores() { +bool detail::Program_::runs_on_noc_multicast_only_cores() { return (hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) != -1 and - this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).size()); + not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).empty()); +} + +bool Program::runs_on_noc_multicast_only_cores() { return pimpl_->runs_on_noc_multicast_only_cores(); } + +Program::Program(Program &&other) noexcept = default; + +Program& Program::operator=(Program &&other) noexcept = default; + +Program::~Program() noexcept = default; + +uint64_t detail::Program_::get_id() const { return this->id; } + +uint64_t Program::get_id() const { return pimpl_->get_id(); } + +uint64_t detail::Program_::get_runtime_id() const { return this->runtime_id; } + +uint64_t Program::get_runtime_id() const { return pimpl_->get_runtime_id(); } + +size_t detail::Program_::num_kernels() const { + size_t count = 0; + for (const auto& kernels : kernels_) { + count += kernels.size(); + } + return count; +} + +size_t Program::num_kernels() const { return pimpl_->num_kernels(); } + +const std::vector> &detail::Program_::circular_buffers() const { return 
circular_buffers_; } + +const std::vector> &Program::circular_buffers() const { return pimpl_->circular_buffers(); } + +const std::vector< Semaphore > & detail::Program_::semaphores() const { return semaphores_; } + +const std::vector< Semaphore > & Program::semaphores() const { return pimpl_->semaphores(); } + +void detail::Program_::add_buffer(std::shared_ptr buf) { owned_buffer_pool.push_back(std::move(buf)); } + +void Program::add_buffer(std::shared_ptr buf) { pimpl_->add_buffer(std::move(buf)); } + +void detail::Program_::release_buffers() { owned_buffer_pool = {}; } + +void Program::release_buffers() { pimpl_->release_buffers(); } + +std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core) const { + std::vector> semaphores; + for (const Semaphore &s : this->semaphores_) { + if (s.initialized_on_logical_core(core)) { + semaphores.emplace_back(std::cref(s)); + } + } + return semaphores; +} + +std::vector> Program::semaphores_on_core(const CoreCoord &core) const { + return pimpl_->semaphores_on_core(core); +} + +bool detail::Program_::is_finalized() const { return this->finalized_; } + +bool Program::is_finalized() const { return pimpl_->is_finalized(); } + +const ProgramTransferInfo &Program::get_program_transfer_info() const noexcept { return pimpl_->program_transfer_info; } + +const std::shared_ptr &Program::get_kernels_buffer() const noexcept { return pimpl_->kernels_buffer; } + +const std::vector &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; } + +std::unordered_map &Program::get_cached_program_command_sequences() noexcept { + return pimpl_->cached_program_command_sequences_; } -Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 05aa822d787..b239ddf93b3 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -35,6 +35,8 @@ class EnqueueProgramCommand; class HWCommandQueue; 
class JitBuildOptions; namespace detail{ + class Program_; + void ValidateCircularBufferRegion(const Program &program, const Device *device); KernelHandle AddKernel (Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type); std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id); @@ -56,7 +58,7 @@ struct KernelGroup { KernelGroup(); KernelGroup( - const Program &program, + const detail::Program_ &program, uint32_t programmable_core_type_index, kernel_id_array_t kernel_ids, bool erisc_is_idle, @@ -90,48 +92,32 @@ class Program { Program(const Program &other) = delete; Program& operator=(const Program &other) = delete; - Program(Program &&other) = default; - Program& operator=(Program &&other) = default; + Program(Program &&other) noexcept; + Program& operator=(Program &&other) noexcept; void set_runtime_id(uint64_t id); - ~Program(); - - void construct_core_range_set_for_worker_cores(); + ~Program() noexcept; - const uint64_t get_id() const { return this->id; } - const uint64_t get_runtime_id() const { return this->runtime_id; } + uint64_t get_id() const; + uint64_t get_runtime_id() const; - size_t num_kernels() const { - size_t count = 0; - for (const auto& kernels : kernels_) { - count += kernels.size(); - } - return count; - } + size_t num_kernels() const; - const std::vector> &circular_buffers() const { return circular_buffers_; } + const std::vector> &circular_buffers() const; - const std::vector< Semaphore > & semaphores() const { return semaphores_; } + const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); std::vector& get_kernel_groups(uint32_t programmable_core_type_index); - inline void add_buffer(std::shared_ptr buf) { owned_buffer_pool.push_back(buf); } - inline void release_buffers() { owned_buffer_pool = {}; } - const std::vector> circular_buffers_on_core(const CoreCoord &core) const; + void add_buffer(std::shared_ptr 
buf); + void release_buffers(); + std::vector> circular_buffers_on_core(const CoreCoord &core) const; - const std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; + std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; - const std::vector circular_buffers_unique_coreranges() const; + std::vector circular_buffers_unique_coreranges() const; - auto semaphores_on_core(const CoreCoord &core) const { - std::vector> semaphores; - for ( const Semaphore & s : this->semaphores_) { - if (s.initialized_on_logical_core(core)) { - semaphores.emplace_back(std::cref(s)); - } - } - return semaphores; - } + std::vector> semaphores_on_core(const CoreCoord &core) const; size_t num_semaphores ( const CoreCoord & core ) const; size_t num_semaphores () const; @@ -139,16 +125,13 @@ class Program { // XXXXX TODO: this should return a const reference std::vector> logical_cores() const; - // Is worker_crs_ used anywhere? - const CoreRangeSet& get_worker_core_range_set() const { return worker_crs_; }; - void compile(Device * device, bool fd_bootloader_mode = false); void invalidate_circular_buffer_allocation(); void allocate_circular_buffers(const Device *device); - bool is_finalized() const { return this->finalized_; } + bool is_finalized() const; void finalize(Device *device); std::shared_ptr get_kernel(KernelHandle kernel_id) const; @@ -161,73 +144,7 @@ class Program { uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; private: - void populate_dispatch_data(Device *device); - - // Buffers temporarily owned by the program - std::vector> owned_buffer_pool = {}; - - // The buffer that holds the kernel/binaries/etc for this program - std::shared_ptr kernels_buffer = nullptr; - ProgramTransferInfo program_transfer_info; - - bool finalized_; - - struct CircularBufferAllocator { - CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {} - - // Circular buffers are created and allocated at core range 
granularity - CoreRange core_range; - - // Holds vector of addresses where circular buffers are allocated [start, end) - // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address - // To enable this, circular buffer address is the maximum address amongst all of its target cores - // This vector is sorted from lower to higher address spaces - std::vector> l1_regions; - - // Returns address for next circular buffer - // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region - uint64_t get_cb_region_end() const { - return this->l1_regions.empty() ? 0 : this->l1_regions.back().second; - } - - // If address is the end of the last L1 region, the last region is extended by size bytes, - // otherwise address must be higher than existing regions and a new L1 region [address, size) is added - void mark_address(uint64_t address, uint64_t size, uint64_t base_address); - - // Reset when circular buffer allocation is invalidated - void reset_available_addresses() { this->l1_regions.clear(); } - }; - - uint64_t id; // Need to make non-const due to move constructor - uint64_t runtime_id; - static std::atomic program_counter; - std::vector >> kernels_; - std::vector grid_extent_; - - std::vector> circular_buffers_; - std::unordered_map> circular_buffer_by_id_; - // Tracks which circular buffer indices are being used - std::unordered_map> per_core_cb_indices_; - // Used to generate circular buffer addresses. 
There is one CircularBufferAllocator per unique CoreRange - std::vector cb_allocators_; - - std::vector semaphores_; - - CoreRangeSet worker_crs_; - std::unordered_set compiled_; - bool local_circular_buffer_allocation_needed_; - - static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; - std::vector> kernel_groups_; - std::vector> core_to_kernel_group_index_table_; - uint32_t tensix_go_signal_count_; - - std::vector> config_buffers_; - - std::vector program_configs_; - std::vector program_config_sizes_; - - std::unordered_map cached_program_command_sequences_; + std::unique_ptr pimpl_; friend CBHandle CreateCircularBuffer(Program &program, const std::variant &core_spec, const CircularBufferConfig &config); friend std::shared_ptr detail::GetCircularBuffer(const Program &program, CBHandle id); @@ -237,38 +154,23 @@ class Program { friend std::shared_ptr detail::GetKernel(const Program &program, KernelHandle kernel_id); friend uint32_t CreateSemaphore(Program &program, const std::variant &core_spec, uint32_t initial_value, CoreType core_type); - KernelHandle add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &core_type); CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config); - std::shared_ptr get_circular_buffer(CBHandle cb_id) const; void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); friend void detail::AddConfigBuffer(Program &program, std::shared_ptr config_buffer); - void add_config_buffer(std::shared_ptr config_buffer); - - // Ensures that statically allocated circular buffers do not grow into L1 buffer space - void validate_circular_buffer_region(const Device *device) const; - - void set_cb_data_fmt( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; - - void set_cb_tile_dims( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; - - void update_kernel_groups(uint32_t 
programmable_core_type_index); - - uint32_t& get_program_config_size(uint32_t programmable_core_type_index); - - uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset); - void set_launch_msg_sem_offsets(); bool runs_on_noc_unicast_only_cores(); bool runs_on_noc_multicast_only_cores(); friend HWCommandQueue; friend EnqueueProgramCommand; + + const ProgramTransferInfo &get_program_transfer_info() const noexcept; + const std::shared_ptr &get_kernels_buffer() const noexcept; + const std::vector &get_program_config_sizes() const noexcept; + std::unordered_map &get_cached_program_command_sequences() noexcept; }; } // namespace v0 diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index aaeea3d05b7..59d16af6b8c 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -30,7 +30,7 @@ size_t interleaved_page_size( const uint32_t buf_size, const uint32_t num_banks, const uint32_t min_size, const uint32_t max_size) { // Populate power of 2 numbers within min and max as candidates TT_FATAL(min_size > 0 and min_size <= max_size, "min_size {} not positive and less than or equal to max_size {}.", min_size, max_size); - vector candidates; + std::vector candidates; candidates.reserve(__builtin_clz(min_size) - __builtin_clz(max_size) + 1); for (uint32_t size = 1; size <= max_size; size <<= 1) { if (size >= min_size) { @@ -71,7 +71,7 @@ std::shared_ptr Trace::create_empty_trace_buffer() { } void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr trace_buffer) { - vector& trace_data = trace_buffer->desc->data; + std::vector& trace_data = trace_buffer->desc->data; uint64_t unpadded_size = trace_data.size() * 
sizeof(uint32_t); size_t page_size = interleaved_page_size( unpadded_size, cq.device()->num_banks(BufferType::DRAM), kExecBufPageMin, kExecBufPageMax); @@ -98,7 +98,7 @@ void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr tra // there is a cost to validation, please use it judiciously void Trace::validate_instance(const TraceBuffer& trace_buffer) { - vector backdoor_data; + std::vector backdoor_data; detail::ReadFromBuffer(trace_buffer.buffer, backdoor_data); if (backdoor_data != trace_buffer.desc->data) { log_info(LogMetalTrace, "Trace buffer expected: {}", trace_buffer.desc->data); diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp index 2962cccc8af..fdb3751d435 100644 --- a/tt_metal/jit_build/build.hpp +++ b/tt_metal/jit_build/build.hpp @@ -121,7 +121,7 @@ class alignas(CACHE_LINE_ALIGNMENT) JitBuildState { // Set of build states // Used for parallel builds, builds all members in one call -typedef vector> JitBuildStateSet; +typedef std::vector> JitBuildStateSet; // Exracts a slice of builds from a JitBuildState // Used for parallel building a subset of the builds in a JitBuildStateSet diff --git a/tt_metal/jit_build/data_format.cpp b/tt_metal/jit_build/data_format.cpp index f5e9e7e8e7f..6e7d67276ad 100644 --- a/tt_metal/jit_build/data_format.cpp +++ b/tt_metal/jit_build/data_format.cpp @@ -1,10 +1,20 @@ // SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 + #include "data_format.hpp" -#include "hostdevcommon/common_runtime_address_map.h" -#include -#include + +#include // for basic_ostream +#include // for operator!= +#include // for set +#include // for char_traits +#include // for unordered_map + +#include "fmt/base.h" // for format_string +#include "tt_metal/common/assert.hpp" // for tt_throw, TT_FATAL +#include "tt_metal/common/base_types.hpp" // for UnpackToDestMode +#include "hostdevcommon/common_runtime_address_map.h" // for NUM_CIRCULAR_B... 
+ namespace tt { static const std::set ALL_VALID_FORMATS = { diff --git a/tt_metal/jit_build/data_format.hpp b/tt_metal/jit_build/data_format.hpp index a92d6dbd2fc..c4ab84f2679 100644 --- a/tt_metal/jit_build/data_format.hpp +++ b/tt_metal/jit_build/data_format.hpp @@ -4,17 +4,16 @@ #pragma once #include -#include -#include #include -#include -#include "common/base.hpp" +#include "common/tt_backend_api_types.hpp" // for DataFormat +#include "device/tt_arch_types.h" // for ARCH +enum class UnpackToDestMode : std::uint8_t; namespace tt { static constexpr uint NUM_OPERANDS = 8; -enum class ExpPrecision : uint8_t +enum class ExpPrecision : std::uint8_t { A = 0, B = 1, diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 28521cabc72..dc22b623c3a 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -158,7 +158,7 @@ uint32_t generate_risc_startup_addr(bool is_eth_core) { } void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core) { - vector jump_to_fw; + std::vector jump_to_fw; jump_to_fw.push_back(generate_risc_startup_addr(is_ethernet_core(core, chip_id))); write_hex_vec_to_core(chip_id, core, jump_to_fw, 0); } diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 00494a636b7..baaa49dfdc4 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -220,7 +220,7 @@ void RunTimeOptions::ParseFeatureEnv(RunTimeDebugFeatures feature) { void RunTimeOptions::ParseFeatureCoreRange( RunTimeDebugFeatures feature, const std::string &env_var, CoreType core_type) { char *str = std::getenv(env_var.c_str()); - vector cores; + std::vector cores; // Check if "all" is specified, rather than a range of cores. 
feature_targets[feature].all_cores[core_type] = RunTimeDebugClassNoneSpecified; @@ -280,7 +280,7 @@ void RunTimeOptions::ParseFeatureCoreRange( } void RunTimeOptions::ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string &env_var) { - vector chips; + std::vector chips; char *env_var_str = std::getenv(env_var.c_str()); // If the environment variable is not empty, parse it. diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d89cfa14e2f..d79b282a415 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -4,18 +4,47 @@ #include "tt_cluster.hpp" -#include - +#include +#include +#include #include -#include #include +#include +#include +#include +#include #include +#include // for get +#include +#include +#include +#include + +#include "fmt/base.h" +#include "tt_metal/common/base.hpp" +#include "tt_metal/common/logger.hpp" +#include "tt_metal/common/metal_soc_descriptor.h" +#include "tt_metal/common/test_common.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "third_party/umd/device/tt_arch_types.h" +#include "third_party/umd/device/tt_cluster_descriptor.h" +#include "third_party/umd/device/tt_cluster_descriptor_types.h" +#include "third_party/umd/device/tt_device.h" +#include "third_party/umd/device/tt_soc_descriptor.h" +#include "third_party/umd/device/tt_xy_pair.h" +#include "third_party/umd/device/xy_pair.h" + +// TODO: ARCH_NAME specific, must remove +#include "eth_l1_address_map.h" +#include "dev_msgs.h" +#include "tensix.h" +// +// +#include "llrt/hal.hpp" // for Hal -#include "hostdevcommon/dprint_common.h" -#include "rtoptions.hpp" -#include "third_party/umd/device/tt_silicon_driver_common.hpp" +#include "third_party/tracy/public/tracy/Tracy.hpp" #include "third_party/umd/device/simulation/tt_simulation_device.h" -#include "tools/profiler/profiler.hpp" + #include "tt_metal/impl/debug/sanitize_noc_host.hpp" #include "tt_metal/llrt/rtoptions.hpp" #include 
"tt_metal/llrt/tlb_config.hpp" @@ -409,7 +438,7 @@ inline uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, u return result; } -void Cluster::write_dram_vec(vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { +void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { int chip_id, d_chan, d_subchannel; std::tie(chip_id, d_chan, d_subchannel) = dram; const metal_SocDescriptor &desc_to_use = get_soc_desc(chip_id); @@ -427,7 +456,7 @@ void Cluster::write_dram_vec(vector &vec, tt_target_dram dram, uint64_ } void Cluster::read_dram_vec( - vector &vec, uint32_t sz_in_bytes, tt_target_dram dram, uint64_t addr, bool small_access) const { + std::vector &vec, uint32_t sz_in_bytes, tt_target_dram dram, uint64_t addr, bool small_access) const { int chip_id, d_chan, d_subchannel; std::tie(chip_id, d_chan, d_subchannel) = dram; const metal_SocDescriptor &desc_to_use = get_soc_desc(chip_id); @@ -473,7 +502,7 @@ void Cluster::read_core( } void Cluster::read_core( - vector &data, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access) const { + std::vector &data, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access) const { data.resize(size_in_bytes / sizeof(uint32_t)); read_core(data.data(), size_in_bytes, core, addr, small_access); } diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 2bf49c81e8d..9d216bc7f47 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -19,6 +19,7 @@ // clang-format off #include "noc/noc_parameters.h" #include "eth_interface.h" +#include "eth_l1_address_map.h" #include "dev_msgs.h" // clang-format on @@ -27,7 +28,6 @@ static constexpr std::uint32_t SW_VERSION = 0x00020000; using tt_target_dram = std::tuple; -using tt::TargetDevice; enum EthRouterMode : uint32_t { IDLE = 0, @@ -79,9 +79,9 @@ class Cluster { void deassert_risc_reset_at_core(const tt_cxy_pair 
&physical_chip_coord) const; void assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; - void write_dram_vec(vector &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; + void write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; void read_dram_vec( - vector &vec, + std::vector &vec, uint32_t size_in_bytes, tt_target_dram dram, uint64_t addr, @@ -93,7 +93,7 @@ class Cluster { void read_core( void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - vector &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + std::vector &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; std::optional> get_tlb_data(const tt_cxy_pair &target) const { chip_id_t mmio_device_id = device_to_mmio_device_.at(target.chip); diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp index 23f21b3c5c8..80093cb45c1 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp @@ -69,7 +69,7 @@ int main(int argc, char **argv) { DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); /* Set the parameters that the compute kernel will use */ - vector compute_kernel_args = {}; + std::vector compute_kernel_args = {}; /* Use the add_tiles operation in the compute kernel */ KernelHandle eltwise_binary_kernel_id = CreateKernel( diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp index 73ab8f59cc1..e4a31d64676 100644 --- 
a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp +++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp @@ -127,7 +127,7 @@ int main(int argc, char **argv) { /* * Set the parameters that the compute kernel will use. */ - vector compute_kernel_args = { + std::vector compute_kernel_args = { }; constexpr bool fp32_dest_acc_en = false; diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp index 7744018cc4c..bc3e6593501 100644 --- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp +++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp @@ -90,7 +90,7 @@ int main(int argc, char **argv) { /* * Set the parameters that the compute kernel will use. */ - vector compute_kernel_args = { + std::vector compute_kernel_args = { num_tiles, 1 }; diff --git a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp index 2ab6c43d454..90f663adbe6 100644 --- a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp +++ b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp @@ -20,7 +20,7 @@ int main(int argc, char **argv) { // Configure and Create Void Kernel - vector compute_kernel_args = {}; + std::vector compute_kernel_args = {}; KernelHandle void_compute_kernel_id = CreateKernel( program, "tt_metal/programming_examples/hello_world_compute_kernel/kernels/compute/void_compute_kernel.cpp", diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp index 409b2c6f009..f655bc93dea 100644 --- a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp +++ b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp @@ -27,7 +27,7 @@ void 
golden_matmul(std::vector& a, std::vector& b, std::vect float c_f; float float_tmp; - vector c_bf(M * N, 0); + std::vector c_bf(M * N, 0); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { @@ -47,7 +47,7 @@ void golden_matmul(std::vector& a, std::vector& b, std::vect } -void matmul_multi_core(vector& a, vector& b, vector& output, bool bcast_batch, +void matmul_multi_core(std::vector& a, std::vector& b, std::vector& output, bool bcast_batch, uint32_t M, uint32_t N, uint32_t K, uint32_t B, Device* device) { /* @@ -169,7 +169,7 @@ void matmul_multi_core(vector& a, vector& b, vector compute_args_group_1 = { + std::vector compute_args_group_1 = { 1, // B 1, // Mt Kt, // Kt @@ -184,7 +184,7 @@ void matmul_multi_core(vector& a, vector& b, vector compute_args_group_2 = { + std::vector compute_args_group_2 = { 1, // B 1, // Mt Kt, // Kt @@ -285,7 +285,7 @@ int main(int argc, char **argv) { std::vector src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.2); /* Golden Matmul running on CPU (Float)*/ - vector golden_vec(M * N, 0); + std::vector golden_vec(M * N, 0); golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B); /* Input vector tilizing */ @@ -293,7 +293,7 @@ int main(int argc, char **argv) { tilize(src1_vec, K, N); /* Calling the MatMul host program. 
Read in result into a host vector */ - vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); + std::vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); matmul_multi_core(src0_vec, src1_vec, result_vec, false, M, N, K, B, device); untilize(result_vec, M, N); diff --git a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp index 50226e14d58..5c5f12eedcf 100644 --- a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp +++ b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp @@ -19,7 +19,7 @@ using namespace tt; using namespace tt::tt_metal; -void golden_matmul(vector& a, vector& b, vector& output, +void golden_matmul(std::vector& a, std::vector& b, std::vector& output, uint32_t M, uint32_t N, uint32_t K, uint32_t B) { std::uint32_t idx_c = 0; std::uint32_t idx_a = 0; @@ -27,7 +27,7 @@ void golden_matmul(vector& a, vector& b, vector& o float c_f; float float_tmp; - vector c_bf(M * N, 0); + std::vector c_bf(M * N, 0); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { @@ -46,7 +46,7 @@ void golden_matmul(vector& a, vector& b, vector& o } } -void matmul_multicore_reuse(vector& a, vector& b, vector& output, bool bcast_batch, +void matmul_multicore_reuse(std::vector& a, std::vector& b, std::vector& output, bool bcast_batch, uint32_t M, uint32_t N, uint32_t K, uint32_t B, Device* device) { /* @@ -122,7 +122,7 @@ void matmul_multicore_reuse(vector& a, vector& b, vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles @@ -372,7 +372,7 @@ int main(int argc, char **argv) { std::vector src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.3); /* Golden Matmul running on CPU (Float)*/ - vector golden_vec(M * N, 0); + std::vector golden_vec(M * N, 
0); golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B); /* Input vector tilizing */ @@ -380,7 +380,7 @@ int main(int argc, char **argv) { tilize(src1_vec, K, N); /* Calling the MatMul host program. Read in result into a host vector */ - vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); + std::vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); matmul_multicore_reuse(src0_vec, src1_vec, result_vec, false, M, N, K, B, device); untilize(result_vec, M, N); diff --git a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp index 97c7b7f06d7..b6beb079bea 100644 --- a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp +++ b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp @@ -123,7 +123,7 @@ void matmul_multicore_reuse_mcast(std::vector& a, std::vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles @@ -479,7 +479,7 @@ int main(int argc, char **argv) { std::vector src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.3); /* Golden Matmul running on CPU (Float)*/ - vector golden_vec(M * N, 0); + std::vector golden_vec(M * N, 0); golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B); /* Input vector tilizing */ @@ -487,7 +487,7 @@ int main(int argc, char **argv) { tilize(src1_vec, K, N); /* Calling the MatMul host program. 
Read in result into a host vector */ - vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); + std::vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); matmul_multicore_reuse_mcast(src0_vec, src1_vec, result_vec, false, M, N, K, B, device); untilize(result_vec, M, N); diff --git a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp index 598e559b967..6e95757ba1e 100644 --- a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp +++ b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp @@ -26,7 +26,7 @@ void golden_matmul(std::vector& a, std::vector& b, std::vect float c_f; float float_tmp; - vector c_bf(M * N, 0); + std::vector c_bf(M * N, 0); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { @@ -154,7 +154,7 @@ void matmul_single_core(std::vector& a, std::vector& b, std: core, tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args}); - vector compute_args = { + std::vector compute_args = { B, // B Mt, // Mt Kt, // Kt @@ -224,7 +224,7 @@ int main(int argc, char **argv) { std::vector src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522); /* Golden Matmul running on CPU (Float)*/ - vector golden_vec(M * N, 0); + std::vector golden_vec(M * N, 0); golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B); /* Input vector tilizing */ @@ -232,7 +232,7 @@ int main(int argc, char **argv) { tilize(src1_vec, K, N); /* Calling the MatMul host program. 
Read in result into a host vector */ - vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); + std::vector result_vec(dram_buffer_C_size/sizeof(bfloat16)); matmul_single_core(src0_vec, src1_vec, result_vec, false, M, N, K, B, device); untilize(result_vec, M, N); diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp index adfa251352a..19bf2d8d43e 100644 --- a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp +++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp @@ -36,7 +36,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count) all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines}); - vector trisc_kernel_args = {}; + std::vector trisc_kernel_args = {}; tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel( program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count_compute.cpp", all_cores, diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp index 9856c83d007..377c0016c26 100644 --- a/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp +++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp @@ -36,7 +36,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count) all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines}); - vector 
trisc_kernel_args = {}; + std::vector trisc_kernel_args = {}; tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel( program, "tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/kernels/custom_cycle_count_compute_slow_dispatch.cpp", all_cores, diff --git a/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp index 46cfce4c214..dc16098bf9b 100644 --- a/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp +++ b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp @@ -34,7 +34,7 @@ void RunCustomCycle(tt_metal::Device *device, int loop_count) all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines}); - vector trisc_kernel_args = {}; + std::vector trisc_kernel_args = {}; tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel( program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count_compute.cpp", all_cores, diff --git a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp index b3e7a629d6d..34a47b1d5c8 100644 --- a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp +++ b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp @@ -33,7 +33,7 @@ void RunFillUpAllBuffers(tt_metal::Device *device, int loop_count, bool fast_dis program, "tt_metal/programming_examples/profiler/test_full_buffer/kernels/full_buffer.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines}); - vector trisc_kernel_args = {}; + std::vector trisc_kernel_args = {}; tt_metal::KernelHandle 
trisc_kernel = tt_metal::CreateKernel( program, "tt_metal/programming_examples/profiler/test_full_buffer/kernels/full_buffer_compute.cpp", all_cores, diff --git a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp index 844d4dc3bdb..4f92c4c0b34 100644 --- a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp +++ b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp @@ -28,7 +28,7 @@ void RunCustomCycle(tt_metal::Device *device, int fastDispatch) all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); - vector trisc_kernel_args = {}; + std::vector trisc_kernel_args = {}; tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel( program, "tt_metal/programming_examples/profiler/test_multi_op/kernels/multi_op_compute.cpp", all_cores, diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index 6deb8d7d2c6..357154e0782 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit 6deb8d7d2c6513af090d91c58e3ace53b4564b4e +Subproject commit 357154e078258810da1e84d74556cb4d0c0cde64 diff --git a/tt_metal/tools/memset.cpp b/tt_metal/tools/memset.cpp index 73e9a181bc6..9713f839e5c 100644 --- a/tt_metal/tools/memset.cpp +++ b/tt_metal/tools/memset.cpp @@ -7,7 +7,7 @@ #include -void memset_l1(vector mem_vec, uint32_t chip_id, uint32_t start_addr) { +void memset_l1(std::vector mem_vec, uint32_t chip_id, uint32_t start_addr) { // Utility function that writes a memory vector to L1 for all cores at a specific start address. 
const metal_SocDescriptor &sdesc = tt::Cluster::instance().get_soc_desc(chip_id); for (auto &worker_core : sdesc.physical_workers) { @@ -15,7 +15,7 @@ void memset_l1(vector mem_vec, uint32_t chip_id, uint32_t start_addr) } } -void memset_dram(vector mem_vec, uint32_t chip_id, uint32_t start_addr) { +void memset_dram(std::vector mem_vec, uint32_t chip_id, uint32_t start_addr) { // Utility function that writes a memory to all channels and subchannels at a specific start address. const metal_SocDescriptor &sdesc = tt::Cluster::instance().get_soc_desc(chip_id); for (uint32_t dram_src_channel_id = 0; dram_src_channel_id < sdesc.dram_cores.size(); dram_src_channel_id++) { diff --git a/tt_metal/tools/profiler/op_profiler.hpp b/tt_metal/tools/profiler/op_profiler.hpp index 04aeb667af1..c7c35c1bde2 100644 --- a/tt_metal/tools/profiler/op_profiler.hpp +++ b/tt_metal/tools/profiler/op_profiler.hpp @@ -195,7 +195,7 @@ static inline json get_tensor_json(const Tensor& tensor) { static inline std::vector get_tensors_json(const std::vector& tensors) { ZoneScoped; - vector ret; + std::vector ret; for (auto& tensor : tensors) { ret.push_back(get_tensor_json(tensor)); } @@ -204,7 +204,7 @@ static inline std::vector get_tensors_json(const std::vector& tens static inline std::vector get_tensors_json(const std::vector>& tensors) { ZoneScoped; - vector ret; + std::vector ret; for (auto& tensor : tensors) { if (tensor.has_value()) { ret.push_back(get_tensor_json(tensor.value())); @@ -215,7 +215,7 @@ static inline std::vector get_tensors_json(const std::vector get_tensors_json(const std::vector>& tensors) { ZoneScoped; - vector ret; + std::vector ret; for (auto& tensor : tensors) { if (tensor.has_value()) { ret.push_back(get_tensor_json(tensor.value())); diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index ff4c7078723..c761343eaa0 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -24,7 +24,7 @@ 
namespace tt_metal { void DeviceProfiler::readRiscProfilerResults( int device_id, - const vector &profile_buffer, + const std::vector &profile_buffer, const CoreCoord &worker_core ){ @@ -52,7 +52,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core); uint32_t startIndex = coreFlatID * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; - vector control_buffer = tt::llrt::read_hex_vec_from_core( + std::vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, worker_core, reinterpret_cast(profiler_msg->control_vector), @@ -372,7 +372,7 @@ void DeviceProfiler::generateZoneSourceLocationsHashes() void DeviceProfiler::dumpResults ( Device *device, - const vector &worker_cores, + const std::vector &worker_cores, bool lastDump){ #if defined(TRACY_ENABLE) ZoneScoped; diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index dfcf6986572..483546fd280 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -81,7 +81,7 @@ class DeviceProfiler { // Helper function for reading risc profile results void readRiscProfilerResults( int device_id, - const vector &profile_buffer, + const std::vector &profile_buffer, const CoreCoord &worker_core); //Push device results to tracy @@ -114,7 +114,7 @@ class DeviceProfiler { void setOutputDir(const std::string& new_output_dir); //Traverse all cores on the device and dump the device profile results - void dumpResults(Device *device, const vector &worker_cores, bool lastDump); + void dumpResults(Device *device, const std::vector &worker_cores, bool lastDump); }; } // namespace tt_metal diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 0832677dc7f..b7a8eaae932 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -166,7 +166,7 @@ void 
syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptr(&profiler_msg->buffer[briscIndex][kernel_profiler::CUSTOM_MARKERS]); - vector sync_times = tt::llrt::read_hex_vec_from_core( + std::vector sync_times = tt::llrt::read_hex_vec_from_core( device_id, core, addr, @@ -416,7 +416,7 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs, dispatch_core_type)) { const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); profiler_msg_t *profiler_msg = device->get_dev_addr(curr_core, HalL1MemAddrType::PROFILER); - vector control_buffer = tt::llrt::read_hex_vec_from_core( + std::vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, curr_core, reinterpret_cast(profiler_msg->control_vector), @@ -436,7 +436,7 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor { const auto curr_core = device->physical_core_from_logical_core(core, CoreType::ETH); profiler_msg_t *profiler_msg = device->get_dev_addr(curr_core, HalL1MemAddrType::PROFILER); - vector control_buffer = tt::llrt::read_hex_vec_from_core( + std::vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, core, reinterpret_cast(profiler_msg->control_vector), diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index f71c5c49302..001cec165e1 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -794,33 +794,29 @@ void CompileProgram(Device *device, Program &program, bool fd_bootloader_mode) { program.compile(device, fd_bootloader_mode); } -DeviceAddr AllocateBuffer(const Buffer *buffer, bool bottom_up) { - if(GraphTracker::instance().hook_allocate(buffer, bottom_up)) { - GraphTracker::instance().track_allocate(buffer, bottom_up); +DeviceAddr AllocateBuffer(Buffer *buffer) { + if(GraphTracker::instance().hook_allocate(buffer)) { + GraphTracker::instance().track_allocate(buffer); return 0; } - uint32_t allocated_addr; + DeviceAddr 
allocated_addr; if (is_sharded(buffer->buffer_layout())) { allocated_addr = allocator::allocate_buffer( *(buffer->device()->allocator_), - buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(), - buffer->page_size(), - buffer->buffer_type(), - bottom_up, - buffer->num_cores()); + buffer->shard_spec().size() * buffer->num_cores().value() * buffer->page_size(), + buffer); } else { allocated_addr = allocator::allocate_buffer( *(buffer->device()->allocator_), buffer->size(), - buffer->page_size(), - buffer->buffer_type(), - bottom_up, - std::nullopt); + buffer); } + // Assertion here because buffer class returns a u32 when address is queried + // Requires updating all use cases of buffer address to accept a u64 to remove TT_ASSERT(allocated_addr <= std::numeric_limits::max()); - GraphTracker::instance().track_allocate(buffer, bottom_up); + GraphTracker::instance().track_allocate(buffer); return allocated_addr; } @@ -831,7 +827,7 @@ void DeallocateBuffer(Buffer *buffer) { return; } - allocator::deallocate_buffer(*buffer->device()->allocator_, buffer->address(), buffer->buffer_type()); + allocator::deallocate_buffer(*buffer->device()->allocator_, buffer); } } // namespace detail diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 873296beece..5042aedc602 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -371,7 +371,10 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/uniform_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/device/uniform_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp - + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/bernoulli.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam_pybind.cpp @@ -519,6 +522,10 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp ) #Split src and python bindings diff --git a/ttnn/cpp/pybind11/operations/__init__.hpp b/ttnn/cpp/pybind11/operations/__init__.hpp index 0083e077e7f..6c165efb027 100644 --- a/ttnn/cpp/pybind11/operations/__init__.hpp +++ b/ttnn/cpp/pybind11/operations/__init__.hpp @@ -10,6 +10,7 @@ #include "pybind11/operations/copy.hpp" #include "pybind11/operations/core.hpp" #include "pybind11/operations/creation.hpp" +#include "ttnn/operations/bernoulli/bernoulli_pybind.hpp" #include "ttnn/operations/ccl/all_gather/all_gather_pybind.hpp" #include "ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.hpp" #include "ttnn/operations/conv/conv2d/conv2d_pybind.hpp" @@ -29,6 +30,7 @@ #include "ttnn/operations/experimental/experimental_pybind.hpp" #include "ttnn/operations/full/full_pybind.hpp" #include "ttnn/operations/full_like/full_like_pybind.hpp" +#include "ttnn/operations/index_fill/index_fill_pybind.hpp" #include "ttnn/operations/kv_cache/kv_cache_pybind.hpp" 
#include "ttnn/operations/loss/loss_pybind.hpp" #include "ttnn/operations/matmul/matmul_pybind.hpp" @@ -148,6 +150,12 @@ void py_module(py::module& module) { auto m_uniform = module.def_submodule("uniform", "uniform operations"); uniform::bind_uniform_operation(m_uniform); + + auto m_index_fill = module.def_submodule("index_fill", "index_fill operation"); + index_fill::bind_index_fill_operation(m_index_fill); + + auto m_bernoulli = module.def_submodule("bernoulli", "bernoulli operations"); + bernoulli::bind_bernoulli_operation(m_bernoulli); } } // namespace operations diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp index bebeadebd9d..882f9588a22 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.cpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp @@ -90,7 +90,7 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) { end_function_any_map[typeid(std::reference_wrapper)] = [ptr = this] (const std::any& val) mutable {ptr->end_function_process_tensor(val);}; } -void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { +void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) { const std::lock_guard lock(mutex); auto buf_id = add_buffer(buffer); @@ -478,7 +478,7 @@ nlohmann::json GraphProcessor::end_graph_capture() { return res; } -bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) { +bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer) { return do_block; } diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp index 4f7d6f1b6e7..83179dabe59 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.hpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp @@ -22,7 +22,7 @@ namespace ttnn::graph { public: ProcessorHooks() = default; - bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; + bool hook_allocate(const tt::tt_metal::Buffer* buffer) override; bool 
hook_deallocate(tt::tt_metal::Buffer* buffer) override; @@ -40,7 +40,7 @@ namespace ttnn::graph { GraphProcessor(tt::tt_metal::IGraphProcessor::RunMode mode); ~GraphProcessor() override; - void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override; + void track_allocate(const tt::tt_metal::Buffer* buffer) override; void track_deallocate(tt::tt_metal::Buffer* buffer) override; diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp new file mode 100644 index 00000000000..61c4dbe5622 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp @@ -0,0 +1,19 @@ + +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "bernoulli.hpp" + +#include "device/bernoulli_device_operation.hpp" + +namespace ttnn::operations::bernoulli { +Tensor Bernoulli::invoke( + const Tensor& input, + const std::optional& output, + const std::optional& dtype, + const std::optional& memory_config, + const std::optional& compute_kernel_config) { + return ttnn::prim::bernoulli(input, output, dtype, memory_config, compute_kernel_config); +} +} // namespace ttnn::operations::bernoulli diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp new file mode 100644 index 00000000000..0562def8df9 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ttnn/decorators.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" + +namespace ttnn::operations::bernoulli { +struct Bernoulli { + static Tensor invoke( + const Tensor& input, + const std::optional& output, + const std::optional& dtype, + const std::optional& memory_config, + const std::optional& compute_kernel_config); +}; +} // namespace ttnn::operations::bernoulli + +namespace ttnn { +constexpr auto bernoulli = + ttnn::register_operation_with_auto_launch_op<"ttnn::bernoulli", ttnn::operations::bernoulli::Bernoulli>(); +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp new file mode 100644 index 00000000000..405aefc834c --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "bernoulli_pybind.hpp" + +#include "bernoulli.hpp" +#include "pybind11/decorators.hpp" + +namespace ttnn::operations::bernoulli { +void bind_bernoulli_operation(py::module &module) { + std::string doc = + R"doc( + Generates a tensor to draw binary random numbers (0 or 1) from a Bernoulli distribution. + + Args: + input (ttnn.Tensor): The input tensor of probability values for the Bernoulli distribution. + + Keyword args: + output (ttnn.Tensor, optional): The output tensor. + dtype (ttnn.DataType, optional): Output tensor dtype, default float32. + memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. + compute_kernel_config (ttnn.DeviceComputeKernelConfig, optional): Configuration for the compute kernel. Defaults to `None`. + + Returns: + ttnn.Tensor: the output tensor. 
+ + Example: + >>> input = ttnn.to_device(ttnn.from_torch(torch.empty(3, 3).uniform_(0, 1), dtype=torch.bfloat16)), device=device) + >>> output = ttnn.bernoulli(input) + + )doc"; + + bind_registered_operation( + module, + ttnn::bernoulli, + doc, + ttnn::pybind_arguments_t{ + py::arg("input"), + py::kw_only(), + py::arg("output") = std::nullopt, + py::arg("dtype") = std::nullopt, + py::arg("memory_config") = std::nullopt, + py::arg("compute_kernel_config") = std::nullopt}); +} +} // namespace ttnn::operations::bernoulli diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp new file mode 100644 index 00000000000..5c321318bba --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + +namespace py = pybind11; + +namespace ttnn::operations::bernoulli { +void bind_bernoulli_operation(py::module &module); +} // namespace ttnn::operations::bernoulli diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp new file mode 100644 index 00000000000..da542429466 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "bernoulli_device_operation.hpp" + +namespace ttnn::operations::bernoulli { + +BernoulliDeviceOperation::program_factory_t BernoulliDeviceOperation::select_program_factory( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + return ProgramFactory{}; +} + +void BernoulliDeviceOperation::validate_inputs( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + const auto& input = tensor_args.input; + const auto& output = tensor_args.output; + + TT_FATAL(input.storage_type() == StorageType::DEVICE, "Bernoulli: Input tensor need to be on device"); + TT_FATAL(input.buffer() != nullptr, "Bernoulli: Input tensor need to be allocated in buffers on device"); + TT_FATAL((input.get_layout() == Layout::TILE), "Bernoulli: Input tensor must be tilized"); + TT_FATAL( + input.get_dtype() == DataType::BFLOAT16 || input.get_dtype() == DataType::FLOAT32, + "Bernoulli: Input tensor must be Float32 or Bfloat16"); + + if (output.has_value()) { + TT_FATAL(output.value().storage_type() == StorageType::DEVICE, "Bernoulli: Output tensor need to be on device"); + TT_FATAL( + output.value().buffer() != nullptr, "Bernoulli: Output tensor need to be allocated in buffers on device"); + TT_FATAL((output.value().get_layout() == Layout::TILE), "Bernoulli: Output tensor must be tilized"); + TT_FATAL( + output.value().get_dtype() == DataType::BFLOAT16 || output.value().get_dtype() == DataType::FLOAT32, + "Bernoulli: Output tensor must be Float32 or Bfloat16"); + TT_FATAL( + input.get_logical_volume() == output.value().get_logical_volume(), + "Bernoulli: Output and input tensor shape must be equal"); + } +} + +void BernoulliDeviceOperation::validate_on_program_cache_miss( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate_inputs(operation_attributes, tensor_args); +} + +void 
BernoulliDeviceOperation::validate_on_program_cache_hit( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate_inputs(operation_attributes, tensor_args); +} + +BernoulliDeviceOperation::shape_return_value_t BernoulliDeviceOperation::compute_output_shapes( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + return tensor_args.input.get_logical_shape(); +} + +BernoulliDeviceOperation::tensor_return_value_t BernoulliDeviceOperation::create_output_tensors( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + if (tensor_args.output.has_value()) { + return tensor_args.output.value(); + } + + auto output_shapes = compute_output_shapes(operation_attributes, tensor_args); + return create_device_tensor( + output_shapes, + operation_attributes.dtype, + Layout::TILE, + tensor_args.input.device(), + operation_attributes.memory_config); +} + +std::tuple +BernoulliDeviceOperation::invoke( + const Tensor& input, + const std::optional& output, + const std::optional& dtype, + const std::optional& memory_config, + const std::optional& compute_kernel_config) { + return { + operation_attributes_t{ + dtype.value_or(DataType::FLOAT32), + memory_config.value_or(input.memory_config()), + init_device_compute_kernel_config(input.device()->arch(), compute_kernel_config, MathFidelity::HiFi4)}, + tensor_args_t{input, output}}; +} + +} // namespace ttnn::operations::bernoulli diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp new file mode 100644 index 00000000000..d15841d1442 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/decorators.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" + +namespace ttnn::operations::bernoulli { + +struct BernoulliDeviceOperation { + struct operation_attributes_t { + const DataType dtype; + const MemoryConfig memory_config; + const DeviceComputeKernelConfig compute_kernel_config; + }; + + struct tensor_args_t { + const Tensor& input; + const std::optional& output; + }; + + using shape_return_value_t = SimpleShape; + using tensor_return_value_t = Tensor; + + struct ProgramFactory { + struct shared_variables_t { + KernelHandle reader_kernel_id; + KernelHandle compute_kernel_id; + KernelHandle writer_kernel_id; + std::vector cores; + }; + + using cached_program_t = ttnn::device_operation::CachedProgram; + + static cached_program_t create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output); + + static void override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output); + }; + + using program_factory_t = std::variant; + + static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&); + static void validate_inputs(const operation_attributes_t& attributes, const tensor_args_t& tensor_args); + static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&); + static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&); + static shape_return_value_t compute_output_shapes(const operation_attributes_t&, const tensor_args_t&); + static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const tensor_args_t&); + + static std::tuple invoke( + const Tensor& input, + const std::optional& output, + const std::optional& dtype, + const std::optional& memory_config, 
+ const std::optional& compute_kernel_config); +}; + +} // namespace ttnn::operations::bernoulli + +namespace ttnn::prim { +constexpr auto bernoulli = + ttnn::register_operation<"ttnn::prim::bernoulli", ttnn::operations::bernoulli::BernoulliDeviceOperation>(); +} // namespace ttnn::prim diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp new file mode 100644 index 00000000000..b9a5067b1a4 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 +#include "bernoulli_device_operation.hpp" +#include "common/constants.hpp" +#include "impl/kernels/kernel_types.hpp" +#include "tt_metal/common/work_split.hpp" +#include "ttnn/tensor/types.hpp" + +namespace ttnn::operations::bernoulli { + +using namespace tt; +using namespace tt::tt_metal; + +std::mt19937 rng(std::time(0)); +std::uniform_int_distribution d(1, 1 << 20); + +uint32_t get_random_seed() { return d(rng); } + +BernoulliDeviceOperation::ProgramFactory::cached_program_t BernoulliDeviceOperation::ProgramFactory::create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output) { + const Tensor& input = tensor_args.input; + + Device* device = output.device(); + auto grid = device->compute_with_storage_grid_size(); + auto core_h = grid.y; + + uint32_t units_to_divide = output.volume() / constants::TILE_HW; + auto [num_cores, all_cores, core_group_1, core_group_2, units_per_core_group_1, units_per_core_group_2] = + split_work_to_cores(grid, units_to_divide); + + uint32_t num_cores_x = grid.x; + uint32_t num_cores_y = grid.y; + auto cores = grid_to_cores(num_cores, num_cores_x, num_cores_y); + + Program program = Program(); + + constexpr uint32_t num_tiles = 2; + auto in_data_format = 
datatype_to_dataformat_converter(input.dtype()); + const uint32_t in_dtype_tile_size = tile_size(in_data_format); + constexpr uint32_t in_cb_id = CB::c_in0; + CircularBufferConfig cb_in_config = + CircularBufferConfig(num_tiles * in_dtype_tile_size, {{in_cb_id, in_data_format}}) + .set_page_size(in_cb_id, in_dtype_tile_size); + CBHandle cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_in_config); + + const uint32_t float32_tile_size = tile_size(tt::DataFormat::Float32); + constexpr uint32_t intermed_cb_id = CB::c_intermed0; + CircularBufferConfig cb_intermed_config = + CircularBufferConfig(num_tiles * float32_tile_size, {{intermed_cb_id, tt::DataFormat::Float32}}) + .set_page_size(intermed_cb_id, float32_tile_size); + CBHandle cb_intermed = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed_config); + + auto out_data_format = datatype_to_dataformat_converter(output.dtype()); + const uint32_t out_dtype_tile_size = tile_size(out_data_format); + constexpr uint32_t intermed1_cb_id = CB::c_intermed1; + CircularBufferConfig cb_intermed1_config = + CircularBufferConfig(1 * out_dtype_tile_size, {{intermed1_cb_id, out_data_format}}) + .set_page_size(intermed1_cb_id, out_dtype_tile_size); + CBHandle cb_intermed1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed1_config); + + const std::string kernels_dir_path = "ttnn/cpp/ttnn/operations/bernoulli/device/kernels/"; + const uint32_t input_is_dram = input.buffer()->buffer_type() == BufferType::DRAM ? 1 : 0; + const std::vector reader_compile_time_args{in_cb_id, input_is_dram}; + const std::string reader_file_path = kernels_dir_path + "reader_bernoulli.cpp"; + const std::vector compute_compile_time_args{intermed_cb_id}; + const std::string compute_file_path = kernels_dir_path + "compute_bernoulli.cpp"; + const uint32_t output_is_dram = output.buffer()->buffer_type() == BufferType::DRAM ? 
1 : 0; + const std::vector writer_compile_time_args{in_cb_id, intermed_cb_id, intermed1_cb_id, output_is_dram}; + const std::string writer_file_path = kernels_dir_path + "writer_bernoulli.cpp"; + + std::map writer_defines; + switch (input.dtype()) { + case DataType::BFLOAT16: writer_defines["INPUT_DTYPE_BFLOAT16"] = "1"; break; + case DataType::FLOAT32: writer_defines["INPUT_DTYPE_FLOAT32"] = "1"; break; + default: break; + } + switch (output.dtype()) { + case DataType::BFLOAT16: writer_defines["OUTPUT_DTYPE_BFLOAT16"] = "1"; break; + case DataType::FLOAT32: writer_defines["OUTPUT_DTYPE_FLOAT32"] = "1"; break; + default: break; + } + + KernelHandle reader_kernel_id = tt_metal::CreateKernel( + program, reader_file_path, all_cores, ReaderDataMovementConfig(reader_compile_time_args)); + KernelHandle writer_kernel_id = tt_metal::CreateKernel( + program, writer_file_path, all_cores, WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = + get_compute_kernel_config_args(device->arch(), operation_attributes.compute_kernel_config); + KernelHandle compute_kernel_id = CreateKernel( + program, + compute_file_path, + all_cores, + ComputeConfig{ + .math_fidelity = math_fidelity, + .fp32_dest_acc_en = + true, // must always be true otherwise, generated float number are always in range of [0.4, 0.5] + .dst_full_sync_en = dst_full_sync_en, + .math_approx_mode = math_approx_mode, + .compile_args = compute_compile_time_args, + }); + + uint32_t tile_offset = 0; + for (const auto& core : cores) { + uint32_t units_per_core; + if (core_group_1.core_coord_in_core_ranges(core)) { + units_per_core = units_per_core_group_1; + } else if (core_group_2.core_coord_in_core_ranges(core)) { + units_per_core = units_per_core_group_2; + } else { + TT_THROW("Core not in specified core ranges"); + } + + std::vector reader_runtime_args = {input.buffer()->address(), tile_offset, units_per_core}; + 
SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args); + + std::vector compute_runtime_args = {get_random_seed(), tile_offset, units_per_core}; + SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args); + + std::vector writer_runtime_args = {output.buffer()->address(), tile_offset, units_per_core}; + SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); + + tile_offset += units_per_core; + } + + return { + std::move(program), + {.reader_kernel_id = reader_kernel_id, + .compute_kernel_id = compute_kernel_id, + .writer_kernel_id = writer_kernel_id, + .cores = cores}}; +} + +void BernoulliDeviceOperation::ProgramFactory::override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output) { + auto& program = cached_program.program; + auto& reader_kernel_id = cached_program.shared_variables.reader_kernel_id; + auto& writer_kernel_id = cached_program.shared_variables.writer_kernel_id; + auto& compute_kernel_id = cached_program.shared_variables.compute_kernel_id; + auto& cores = cached_program.shared_variables.cores; + + const uint32_t input_addr = tensor_args.input.buffer()->address(); + const uint32_t output_addr = output.buffer()->address(); + + for (const auto& core : cores) { + { + auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = input_addr; + } + { + auto& runtime_args = GetRuntimeArgs(program, compute_kernel_id, core); + runtime_args[0] = get_random_seed(); + } + { + auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = output_addr; + } + } +} + +} // namespace ttnn::operations::bernoulli diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp new file mode 100644 index 00000000000..816ad043d7e --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "compute_kernel_api.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_unary/rand.h" + +namespace NAMESPACE { + +void MAIN { + constexpr uint32_t intermed_cb_id = get_compile_time_arg_val(0); + + const uint32_t seed = get_arg_val(0); + const uint32_t start_id = get_arg_val(1); + const uint32_t num_tiles = get_arg_val(2); + const uint32_t end_id = start_id + num_tiles; + + init_sfpu(intermed_cb_id); + + union f2u { + float f; + uint32_t u; + } rand_scale; + rand_scale.f = 1; + uint32_t rand_from = 0; + + rand_tile_init(seed); + for (uint32_t i = start_id; i < end_id; ++i) { + cb_reserve_back(intermed_cb_id, 1); + + tile_regs_acquire(); + rand_tile(0, rand_from, rand_scale.u); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile(0, intermed_cb_id, 0); + tile_regs_release(); + + cb_push_back(intermed_cb_id, 1); + } +} +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp new file mode 100644 index 00000000000..ad3af7f5d26 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + constexpr uint32_t in_cb_id = get_compile_time_arg_val(0); + constexpr bool input_is_dram = get_compile_time_arg_val(1) == 1; + + uint32_t input_addr = get_arg_val(0); + uint32_t start_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t end_id = start_id + num_tiles; + + const InterleavedAddrGenFast input_addrg = { + .bank_base_address = input_addr, .page_size = get_tile_size(in_cb_id), .data_format = get_dataformat(in_cb_id)}; + + for (uint32_t i = start_id; i < end_id; ++i) { + cb_reserve_back(in_cb_id, 1); + uint32_t in_cb_write_ptr = get_write_ptr(in_cb_id); + noc_async_read_tile(i, input_addrg, in_cb_write_ptr); + noc_async_read_barrier(); + cb_push_back(in_cb_id, 1); + } +} diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp new file mode 100644 index 00000000000..f2bdbc8c9d8 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +#include "common/constants.hpp" +#include "dataflow_api.h" + +using namespace tt; + +void kernel_main() { + constexpr uint32_t in_cb_id = get_compile_time_arg_val(0); + constexpr uint32_t intermed_cb_id = get_compile_time_arg_val(1); + constexpr uint32_t intermed1_cb_id = get_compile_time_arg_val(2); + constexpr bool output_is_dram = get_compile_time_arg_val(3) == 1; + + auto out_addr = get_arg_val(0); + auto start_id = get_arg_val(1); + auto num_tiles = get_arg_val(2); + uint32_t end_id = start_id + num_tiles; + + const InterleavedAddrGenFast output_addrg = { + .bank_base_address = out_addr, + .page_size = get_tile_size(intermed1_cb_id), + .data_format = get_dataformat(intermed1_cb_id)}; + + cb_reserve_back(intermed1_cb_id, 1); + uint32_t intermed1_cb_write_ptr = get_write_ptr(intermed1_cb_id); + + for (uint32_t i = start_id; i < end_id; ++i) { + cb_wait_front(in_cb_id, 1); + cb_wait_front(intermed_cb_id, 1); + + uint32_t intermed_cb_read_ptr = get_read_ptr(intermed_cb_id); + uint32_t in_cb_read_ptr = get_read_ptr(in_cb_id); + + auto in_cb_addr = reinterpret_cast(in_cb_read_ptr); + auto intermed_cb_addr = reinterpret_cast(intermed_cb_read_ptr); + auto intermed1_cb_addr = reinterpret_cast(intermed1_cb_write_ptr); + + for (uint32_t k = 0; k < constants::TILE_WIDTH; k++) { + for (uint32_t j = 0; j < constants::TILE_HEIGHT; j++) { + float rand_float = *intermed_cb_addr; + + float input = 0; +#ifdef INPUT_DTYPE_FLOAT32 + input = *reinterpret_cast(in_cb_addr); + in_cb_addr += 4; +#endif +#ifdef INPUT_DTYPE_BFLOAT16 // cast: uint16 => uint32 => float and write to input variable. 
+ uint16_t *in_u16_ptr = reinterpret_cast(in_cb_addr); + uint32_t u32 = static_cast(*in_u16_ptr) << 16; + float *f_ptr = reinterpret_cast(&u32); + input = *f_ptr; + in_cb_addr += 2; +#endif + float output = 0; + if (rand_float <= input) { + output = 1; + } + +#ifdef OUTPUT_DTYPE_FLOAT32 + *(float *)intermed1_cb_addr = output; + intermed1_cb_addr += 4; +#endif +#ifdef OUTPUT_DTYPE_BFLOAT16 + uint16_t *out_u16_ptr = reinterpret_cast(&output) + 1; + *(uint16_t *)intermed1_cb_addr = *out_u16_ptr; + intermed1_cb_addr += 2; +#endif + intermed_cb_addr += 1; + } + } + cb_pop_front(in_cb_id, 1); + cb_pop_front(intermed_cb_id, 1); + + noc_async_write_tile(i, output_addrg, intermed1_cb_write_ptr); + noc_async_write_barrier(); + } +} diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index 410f8aaf85c..c66b196eea7 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -720,10 +720,14 @@ std::vector generate_slice_sequence_on_dim( std::size_t worker_index ) { static_assert(std::is_same_v, "generate_slice_sequence_on_dim not yet implemented for type not of tt_xy_pair"); - TT_ASSERT(fracture_dim == 3); // We don't support 4D shapes in the CCL kernels yet, which are needed for proper reduction/concatenation in some cases // so for now we subtract the outer dims from the fracture_dim since we only support 2D at the moment. - fracture_dim -= 2; + if (fracture_dim == 3) { + fracture_dim -= 2; + } else { + // dims are + fracture_dim = 0; + } TT_ASSERT(worker_slice_shape.y == 1); @@ -743,7 +747,7 @@ std::vector generate_slice_sequence_on_dim( log_trace(tt::LogOp, "worker_index {}", worker_index); } - auto worker_slice_start_offset = fracture_dim == 0 ? TensorSlice::ords_t{0, worker_index * worker_slice_shape.y} : TensorSlice::ords_t{worker_index * worker_slice_shape.x, 0}; + auto worker_slice_start_offset = /*fracture_dim == 0 ? 
TensorSlice::ords_t{0, worker_index * worker_slice_shape.y} :*/ TensorSlice::ords_t{worker_index * worker_slice_shape.x, 0}; auto generate_slice = [forward_direction,incr, &slices, &tensor_shape, &slice_shape, &worker_slice_shape, tensor_slice_offset, &worker_slice_start_offset, fracture_dim, dim_start_offset, slice_size_on_dim](std::int64_t i){ auto tensor_slice_offset_adjusted = tensor_slice_offset; diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp index f0f80e7e3f2..8a2b8ed7815 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp @@ -210,7 +210,7 @@ static std::tuple compute_kernel_args = {}; + std::vector compute_kernel_args = {}; constexpr bool fp32_dest_acc_en = false; constexpr bool math_approx_mode = false; std::map eltwise_defines = ttnn::operations::binary::utils::get_defines(binary_math_op); @@ -253,7 +253,7 @@ static void set_reduce_scatter_worker_rt( std::vector& cw_edm_builders, std::vector& ccw_edm_builders, EdmInterfaceAddresses const& edm_interface_addresses, - WorkerAttributes &worker_attributes, + WorkerAttributes const& worker_attributes, std::size_t num_edm_channels, std::size_t edm_num_buffers_per_channel, ttnn::operations::binary::BinaryOpType binary_math_op) { diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index 2c87dd4dd00..8c55e0e68b8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -24,7 +24,7 @@ std::vector ReduceScatter::compute_output_shapes(const std::v auto shape = input_tensors[0].get_logical_shape(); TT_FATAL( 
shape[this->scatter_dim] % this->ring_size == 0, - "The size of the scatter dimension must be a multiple of the ring size"); + "The size of the scatter dimension must be a multiple of the ring size. Dimension size: {}, ring Size: {}", shape[this->scatter_dim], this->ring_size); shape[this->scatter_dim] /= this->ring_size; return std::vector(input_tensors.size(), shape); } @@ -132,6 +132,78 @@ Tensor reduce_scatter( return output_tensors.at(0); } + + + +Tensor reduce_scatter( + const Tensor &input_tensor, + const uint32_t scatter_dim, + const uint32_t cluster_axis, + const MeshDevice& mesh_device, + ttnn::operations::reduction::ReduceType reduce_op, + const uint32_t num_links, + const std::optional& output_mem_config, + ttnn::ccl::Topology topology, + const std::optional user_defined_num_workers, + const std::optional user_defined_num_buffers_per_channel) { + ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(reduce_op); + + TT_FATAL(topology == ttnn::ccl::Topology::Linear, "This all_gather API with cluster_axis is currently supported only for the Linear topology"); + const auto mesh_view = mesh_device.get_view(); + std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols(); + + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + + operation::launch_op( + [scatter_dim, binary_op_type, num_links, output_mem_config, mesh_view, cluster_axis, user_defined_num_workers, user_defined_num_buffers_per_channel, num_devices, topology]( + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector>& optional_output_tensors) mutable -> std::vector { + + const auto& input_device_tensor = input_tensors.at(0); + + const auto coordinate = mesh_view->find_device(input_device_tensor.device()->id()); + const auto view_index = (cluster_axis == 0) ? coordinate.col : coordinate.row; + const auto device_index = (cluster_axis == 0) ? 
coordinate.row : coordinate.col; + + auto get_chip_id = [&](std::size_t line_index) -> std::optional { + auto new_coord = coordinate; + if (cluster_axis == 0) { + new_coord.row = line_index % num_devices; + } else { + new_coord.col = line_index % num_devices; + } + return mesh_view->find_device_id(new_coord); + }; + + bool is_last_chip_in_clockwise_direction = device_index == (num_devices - 1); + bool is_last_chip_in_counter_clockwise_direction = device_index == 0; + auto receiver_device_id = is_last_chip_in_clockwise_direction ? std::nullopt : get_chip_id(device_index + 1); + auto sender_device_id = is_last_chip_in_counter_clockwise_direction ? std::nullopt : get_chip_id(device_index + num_devices - 1); + + return operation::run( + ttnn::ReduceScatter{ + binary_op_type, + scatter_dim, + num_links, + num_devices, + device_index, + receiver_device_id, + sender_device_id, + output_mem_config.value_or(input_device_tensor.memory_config()), + topology, + user_defined_num_workers, + user_defined_num_buffers_per_channel}, + {input_device_tensor}); + }, + {input_tensor}, + output_tensors); + return output_tensors.at(0); + +} + + + } // namespace ccl } // namespace operations diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp index 996d3078ca0..a1d5ea4f1dc 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp @@ -51,12 +51,24 @@ operation::ProgramWithCallbacks reduce_scatter_with_workers( namespace operations{ namespace ccl{ - Tensor reduce_scatter( +Tensor reduce_scatter( const Tensor &input_tensor, const uint32_t scatter_split_dim, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const 
MemoryConfig &output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, + const std::optional user_defined_num_workers = std::nullopt, + const std::optional user_defined_num_buffers_per_channel = std::nullopt); + +Tensor reduce_scatter( + const ttnn::Tensor &input_tensor, + const uint32_t scatter_dim, + const uint32_t cluster_axis, + const MeshDevice& mesh_device, + ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, + const uint32_t num_links = 1, + const std::optional& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, const std::optional user_defined_num_workers = std::nullopt, const std::optional user_defined_num_buffers_per_channel = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp index 1fedbae5584..e36e49ed2ec 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp @@ -4,7 +4,6 @@ #include "ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp" -// #include "tt_metal/common/base.hpp" #include "ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp" #include diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp index 3802ef74873..ea28f4bd932 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp @@ -21,5 +21,20 @@ ttnn::Tensor ExecuteReduceScatter::invoke( MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, math_op, num_links, out_memory_config, topology, 
num_workers, num_buffers_per_channel); } +ttnn::Tensor ExecuteReduceScatter::invoke( + const ttnn::Tensor& input_tensor, + const uint32_t scatter_dim, + const uint32_t cluster_axis, + const MeshDevice& mesh_device, + ttnn::operations::reduction::ReduceType math_op, + const uint32_t num_links, + const std::optional& memory_config, + ttnn::ccl::Topology topology, + const std::optional num_workers, + const std::optional num_buffers_per_channel) { + + MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); + return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel); +} } // namespace ttnn::operations::ccl diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp index 04a11f1f236..b7acc80e794 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp @@ -15,6 +15,18 @@ namespace operations { namespace ccl { struct ExecuteReduceScatter { + static ttnn::Tensor invoke( + const Tensor &input_tensor, + const uint32_t scatter_dim, + const uint32_t cluster_axis, + const MeshDevice& mesh_device, + ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, + const uint32_t num_links = 1, + const std::optional& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, + const std::optional user_defined_num_workers = std::nullopt, + const std::optional user_defined_num_buffers_per_channel = std::nullopt); + static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, const uint32_t scatter_dim, diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp index 
10574a7efb9..bfac2f9a1d1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp @@ -8,7 +8,6 @@ #include #include "ttnn/cpp/pybind11/decorators.hpp" -#include "ttnn/operations/ccl/ccl_host_datastructures.hpp" #include "ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp" #include "ttnn/types.hpp" @@ -44,7 +43,33 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat py::arg("memory_config") = std::nullopt, py::arg("topology") = ttnn::ccl::Topology::Ring, py::arg("num_workers") = std::nullopt, - py::arg("num_buffers_per_channel") = std::nullopt}); + py::arg("num_buffers_per_channel") = std::nullopt}, + + ttnn::pybind_overload_t{ + [](const ccl_operation_t& self, + const ttnn::Tensor& input_tensor, + const uint32_t scatter_dim, + const uint32_t cluster_axis, + const MeshDevice& mesh_device, + ttnn::operations::reduction::ReduceType math_op, + const uint32_t num_links, + const std::optional& output_mem_config, + const std::optional num_workers, + const std::optional num_buffers_per_channel, + const ttnn::ccl::Topology topology) -> ttnn::Tensor { + return self(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, output_mem_config, topology, num_workers, num_buffers_per_channel); + }, + py::arg("input_tensor"), + py::arg("scatter_dim"), + py::arg("cluster_axis"), + py::arg("mesh_device"), + py::arg("math_op"), + py::kw_only(), + py::arg("num_links") = 1, + py::arg("memory_config") = std::nullopt, + py::arg("num_workers") = std::nullopt, + py::arg("num_buffers_per_channel") = std::nullopt, + py::arg("topology") = ttnn::ccl::Topology::Ring}); } } // namespace detail @@ -62,6 +87,11 @@ void py_bind_reduce_scatter(pybind11::module& module) { Args: input_tensor (ttnn.Tensor): multi-device tensor dim (int): Dimension to perform operation + cluster_axis (int): Provided a MeshTensor, the axis corresponding to MeshDevice to 
perform the line-all-gather operation on. + mesh_device (MeshDevice): Device mesh to perform the line-all-gather operation on. + * cluster_axis and mesh_device parameters are applicable only for Linear Topology. + + Mesh Tensor Programming Guide : https://github.com/tenstorrent/tt-metal/blob/main/tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md Keyword Args: num_links (int, optional): Number of links to use for the all-gather operation. Defaults to `1`. diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 0163c3d43a0..e081d6bf44d 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -159,14 +159,7 @@ void py_bind_conv2d(py::module& module) { uint32_t height, uint32_t width, uint32_t in_channels, - uint32_t out_channels, - std::array kernel_size, - std::array stride, - std::array padding, - std::array dilation, - uint32_t weights_width, - uint32_t input_width, - uint32_t groups) -> std::tuple { + uint32_t out_channels) -> std::tuple { return ttnn::operations::conv::conv2d::get_conv_padded_input_shape_and_mem_config( device, input_tensor, @@ -185,14 +178,7 @@ void py_bind_conv2d(py::module& module) { py::arg("height"), py::arg("width"), py::arg("in_channels"), - py::arg("out_channels"), - py::arg("kernel_size"), - py::arg("stride"), - py::arg("padding"), - py::arg("dilation"), - py::arg("weights_width"), - py::arg("input_width"), - py::arg("groups")); + py::arg("out_channels")); module.def( "convert_conv_weight_tensor_to_tiled_layout", diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index d06ef84d459..fa67ff3b582 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -24,7 +24,7 @@ namespace 
optimized_conv_op_utils { using namespace tt; using namespace tt::tt_metal; -std::pair, vector> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) { +std::pair, std::vector> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) { uint32_t filter_h = (uint32_t)sliding_window_config.window_hw.first; // filter_h uint32_t filter_w = (uint32_t)sliding_window_config.window_hw.second; // filter_W diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 3d6eb25c939..038144993ab 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -178,6 +178,6 @@ using namespace tt; using namespace tt::tt_metal; -std::pair, vector> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles); +std::pair, std::vector> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles); } // optimized_conv_op_utils diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 8bd6bd51a0d..30197ecc6a5 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -1272,7 +1272,7 @@ operation::ProgramWithCallbacks 
multi_core_optimized_conv_sharded_v2_impl( writer_compile_time_args.end(), split_reader_args.begin(), split_reader_args.end()); } - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, act_num_subblocks, in0_block_num_tiles, diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp index 8ff30a226ba..1e36a88de03 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp @@ -56,7 +56,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory: bool input_is_dram = input_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; bool output_is_dram = output_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - vector reader_compile_time_args, writer_compile_time_args; + std::vector reader_compile_time_args, writer_compile_time_args; if (tilized) { reader_compile_time_args = { (uint32_t)src_cb_id, @@ -102,7 +102,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory: get_compute_kernel_config_args(input.device()->arch(), operation_attributes.compute_kernel_config); auto create_compute_kernel = [&](const auto& core_group, uint32_t num_units_per_core) { if (!core_group.ranges().empty()) { - vector compute_kernel_args = { + std::vector compute_kernel_args = { (uint32_t)src_cb_id, (uint32_t)dst_cb_id, (uint32_t)num_units_per_core, diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp index af79fddf7d9..a638a6d46e8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp @@ -46,14 +46,14 @@ operation::ProgramWithCallbacks 
s2s_rm_concat_two_tensors_multi_core( uint32_t num_output_rows = output.get_legacy_shape()[-2]; uint32_t num_input_tensors = input_tensors.size(); - vector cb_input(num_input_tensors); - vector input_num_units_per_shard_height(num_input_tensors); - vector input_num_units_per_shard_width(num_input_tensors); + std::vector cb_input(num_input_tensors); + std::vector input_num_units_per_shard_height(num_input_tensors); + std::vector input_num_units_per_shard_width(num_input_tensors); tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); auto all_cores = input_tensors[0].shard_spec().value().grid; - vector cb_ids(num_input_tensors); + std::vector cb_ids(num_input_tensors); uint32_t input_unit_size = input_tensors[0].shard_spec().value().shape[1] * input_tensors[0].element_size(); // input CBs for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) { @@ -201,10 +201,10 @@ operation::ProgramWithCallbacks s2s_concat_multi_core( elements_per_page_height = TILE_HEIGHT; } - vector cb_inputs(num_input_tensors); - vector input_num_pages_per_stick(num_input_tensors); - vector input_num_sticks(num_input_tensors); - vector input_write_offsets(num_input_tensors); + std::vector cb_inputs(num_input_tensors); + std::vector input_num_pages_per_stick(num_input_tensors); + std::vector input_num_sticks(num_input_tensors); + std::vector input_write_offsets(num_input_tensors); // Assume inputs and output have the same sharding grid. 
const auto all_cores = input_tensors[0].shard_spec().value().grid; @@ -299,14 +299,14 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core( uint32_t num_output_rows = output.get_legacy_shape()[-1]; uint32_t num_input_tensors = input_tensors.size(); - vector cb_input(num_input_tensors); - vector input_num_units_per_shard_height(num_input_tensors); - vector input_num_units_per_shard_width(num_input_tensors); + std::vector cb_input(num_input_tensors); + std::vector input_num_units_per_shard_height(num_input_tensors); + std::vector input_num_units_per_shard_width(num_input_tensors); tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); auto all_cores = input_tensors[0].shard_spec().value().grid; - vector cb_ids(num_input_tensors); + std::vector cb_ids(num_input_tensors); uint32_t input_unit_size = input_tensors[0].shard_spec().value().shape[1] * input_tensors[0].element_size(); // input CBs for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) { @@ -357,8 +357,8 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core( curr_num_output_rows = 0; } - vector reader_runtime_args = {}; - vector writer_runtime_args = { + std::vector reader_runtime_args = {}; + std::vector writer_runtime_args = { output.buffer()->address(), core_id, curr_num_output_rows, @@ -400,8 +400,8 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core( curr_num_output_rows = 0; } - vector reader_runtime_args = {curr_num_input_tensors}; - vector writer_runtime_args = { + std::vector reader_runtime_args = {curr_num_input_tensors}; + std::vector writer_runtime_args = { dst_buffer->address(), curr_num_input_tensors, curr_num_output_rows}; for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) { UpdateDynamicCircularBufferAddress(program, input_id, *dst_buffer); @@ -540,7 +540,7 @@ operation::ProgramWithCallbacks concat_multi_core( num_output_pages_per_block += num_accum_pages * dim_pages; } } - vector 
common_reader_kernel_args = {0, 0, 0}; + std::vector common_reader_kernel_args = {0, 0, 0}; common_reader_kernel_args.insert(common_reader_kernel_args.end(), src_addr.begin(), src_addr.end()); common_reader_kernel_args.insert(common_reader_kernel_args.end(), is_dram.begin(), is_dram.end()); common_reader_kernel_args.insert( @@ -615,13 +615,13 @@ operation::ProgramWithCallbacks concat_multi_core( } } - vector reader_kernel_args = common_reader_kernel_args; + std::vector reader_kernel_args = common_reader_kernel_args; reader_kernel_args[0] = num_pages_per_core; reader_kernel_args[1] = curr_tensor; reader_kernel_args[2] = curr_tensor_id; reader_kernel_args.insert(reader_kernel_args.end(), page_id_per_tensor.begin(), page_id_per_tensor.end()); - vector writer_kernel_args; + std::vector writer_kernel_args; if (rm_layout) { writer_kernel_args = { dst_buffer->address(), output.buffer()->page_size(), num_pages_per_core, num_pages_written}; diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp index 7556845fa77..e1f52f4b6be 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp @@ -102,7 +102,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args, kernel_defines)); if (convert_dtype) { - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { num_units_per_core_group_1 }; auto eltwise_unary_kernel_group_1 = tt::tt_metal::CreateKernel( @@ -113,7 +113,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso ); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_units_per_core_group_2 }; auto eltwise_unary_kernel_group_2 = 
tt::tt_metal::CreateKernel( diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index b8d2756443d..8951df02fee 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -531,7 +531,7 @@ operation::ProgramWithCallbacks pad_tile(const Tensor &a, Tensor& output, const } -inline void log_rt_args(const CoreCoord& core, vector& args) { +inline void log_rt_args(const CoreCoord& core, std::vector& args) { for (auto v : args) { tt::log_debug(tt::LogOp, "{},{} :: {}", core.x, core.y, v); } @@ -1276,7 +1276,7 @@ inline std::vector, std::vector>> get_ } // reader rt args - vector reader_kernel_args; + std::vector reader_kernel_args; reader_kernel_args.push_back(core_stick_map.size()); // num_cores tt::log_debug("num_cores: {}", core_stick_map.size()); @@ -1296,7 +1296,7 @@ inline std::vector, std::vector>> get_ } // coalesce the sticks into chunks - vector>> stick_chunks_per_core; + std::vector>> stick_chunks_per_core; for (auto core_stick_pair : core_stick_map) { auto stick_chunks = group_contiguous_and_repeated_values(core_stick_pair.second); stick_chunks_per_core.push_back(stick_chunks); diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp index 80aa09138dc..96d173d1712 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp @@ -97,7 +97,7 @@ operation::ProgramWithCallbacks repeat_multi_core( num_pages_per_block = num_accum_pages * dim_pages; } - vector reader_kernel_args = {src_addr, 0, num_pages_per_block, 0, 0, 0, 0}; + std::vector reader_kernel_args = {src_addr, 0, num_pages_per_block, 0, 0, 0, 0}; if 
(rm_layout) { reader_kernel_args.push_back(src_page_size); } @@ -164,7 +164,7 @@ operation::ProgramWithCallbacks repeat_multi_core( reader_kernel_args[5] = curr_block_start_id; reader_kernel_args[6] = curr_id; - vector writer_kernel_args; + std::vector writer_kernel_args; if (rm_layout) { writer_kernel_args = { dst_buffer->address(), output.buffer()->page_size(), num_pages_per_core, num_pages_written}; diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index db61e2d169e..7f459d8046e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -59,7 +59,7 @@ ttnn::Tensor convert_tensor_to_rm_reshape_convert_back_to_orig_layout(const ttnn //Constraint in device kernel uint32_t ROW_MAJOR_WIDTH = 8; ttnn::Tensor reshaped_rm_tensor; - if((tensor_shape[-1] % ROW_MAJOR_WIDTH == 0 && shape[-1] % ROW_MAJOR_WIDTH == 0) and tensor_shape.rank() == 4) { + if((tensor_shape[-1] % ROW_MAJOR_WIDTH == 0 && shape[-1] % ROW_MAJOR_WIDTH == 0)) { auto rm_tensor = ttnn::to_layout(tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (Device *)nullptr); if (rm_tensor.is_contiguous()) { // Page size depends on the width, so only modify the shape if the width is the same diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp index e7da8344843..6d585e65a13 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp @@ -125,7 +125,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core( 
tt_metal::WriterDataMovementConfig(writer_compile_time_args)); } if (convert_df) { - vector compute_kernel_args = {num_units_per_shard}; + std::vector compute_kernel_args = {num_units_per_shard}; auto eltwise_unary_kernel_group_1 = tt_metal::CreateKernel( program, diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp index e7215850ea1..e159c7c02c4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp @@ -61,7 +61,7 @@ inline std::vector, std::vector>> get_ uint32_t unpadded_row_size_bytes_offset = output_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? tt::round_up(unpadded_row_size_bytes, TILE_WIDTH) : tt::round_up(unpadded_row_size_bytes, TILE_WIDTH / 2); - vector common_reader_kernel_args = { + std::vector common_reader_kernel_args = { input_tensor.buffer()->address() + output_tensor_start[-1] * output_tensor.element_size(), padded_row_size_bytes, unpadded_row_size_bytes, @@ -108,7 +108,7 @@ inline std::vector, std::vector>> get_ unpadded_written = unpadded_written / num_unpadded_sticks_per_dim[j]; start_id += id_per_dim[j] * accumulated_total_per_dim[j - 1]; } - vector reader_kernel_args = common_reader_kernel_args; + std::vector reader_kernel_args = common_reader_kernel_args; // uint32_t addr_offset = 5; // input buffer addr, padded_row_size_bytes, unpadded_row_size_bytes, num_dims reader_kernel_args[addr_offset++] = start_id; @@ -117,7 +117,7 @@ inline std::vector, std::vector>> get_ reader_kernel_args[addr_offset] = num_read_per_barrier; reader_kernel_args.insert(reader_kernel_args.end(), id_per_dim.begin(), id_per_dim.end()); - vector writer_kernel_args = { + std::vector writer_kernel_args = { output_buffer->address(), unpadded_row_size_bytes, unpadded_row_size_bytes_offset, num_sticks_per_core, 
num_sticks_per_core_read, num_read_per_barrier, num_sticks_written, 0}; num_sticks_written += num_sticks_per_core; ret_val[i] = {reader_kernel_args, writer_kernel_args}; @@ -493,7 +493,7 @@ inline std::vector, std::vector>> get_ } // reader rt args - vector reader_kernel_args; + std::vector reader_kernel_args; reader_kernel_args.push_back(core_stick_map.size()); // num_cores tt::log_debug("num_cores: {}", core_stick_map.size()); @@ -513,7 +513,7 @@ inline std::vector, std::vector>> get_ } // coalesce the sticks into chunks - vector>> stick_chunks_per_core; + std::vector>> stick_chunks_per_core; for (auto core_stick_pair : core_stick_map) { auto stick_chunks = group_contiguous_values(core_stick_pair.second); stick_chunks_per_core.push_back(stick_chunks); @@ -531,7 +531,7 @@ inline std::vector, std::vector>> get_ } } - vector writer_kernel_args; + std::vector writer_kernel_args; ret_val[i] = {reader_kernel_args, writer_kernel_args}; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp index 26152476af7..1f9acdd8e3f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp @@ -115,7 +115,7 @@ operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& outp core, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = { + std::vector compute_args = { num_tiles / num_tiles_per_block, // per_core_block_cnt num_tiles_per_block // per_core_block_tile_cnt }; @@ -206,8 +206,8 @@ operation::ProgramWithCallbacks tilize_multi_core_interleaved(const Tensor& a, T /** compute */ - vector compute_args = {nblocks_per_core, ntiles_per_block}; - vector compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block}; + std::vector compute_args = {nblocks_per_core, ntiles_per_block}; + 
std::vector compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block}; if (core_range.ranges().size() > 0) { auto tilize_kernel_id = CreateKernel( @@ -370,7 +370,7 @@ operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, T all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)}; + std::vector compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)}; auto untilize_kernel_id = tt::tt_metal::CreateKernel( program, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp index 716264458d8..2baf47aa6b7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp @@ -157,7 +157,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_single_core( core, tt::tt_metal::WriterDataMovementConfig({output_cb_index, out_is_dram})); - vector compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; + std::vector compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; auto tilize_kernel_id = tt::tt_metal::CreateKernel( program, @@ -292,7 +292,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_interleaved( const std::vector& assignment = core_assignments.at(i); // reader runtime args - vector reader_rt_args = { + std::vector reader_rt_args = { src0_buffer->address(), unpadded_row_size_bytes, padded_row_size_bytes, @@ -425,7 +425,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( */ 
KernelHandle unary_writer_kernel_id; bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - vector writer_ct_args = { + std::vector writer_ct_args = { output_cb_index, }; unary_writer_kernel_id = CreateKernel( @@ -436,7 +436,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( /** compute */ - vector compute_args = { + std::vector compute_args = { (uint32_t)nblocks_per_core, // per_core_block_cnt (uint32_t)ntiles_per_block, // per_block_ntiles }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp index 85c84790518..9cbbaf6cb2b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp @@ -1637,7 +1637,7 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a, uint32_t NHtWt = N * HtWt; auto bbox = all_cores.bounding_box(); - vector cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major); + std::vector cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major); std::vector< std::vector > unary_reader_args = { cores.size(), std::vector(1) }; std::vector< std::vector > unary_compute_args = { cores.size(), std::vector(5) }; @@ -1703,7 +1703,7 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a, bool row_major = shard_spec.orientation == ShardOrientation::ROW_MAJOR; auto bbox = all_cores.bounding_box(); - vector cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major); + std::vector cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major); std::vector< std::vector > unary_reader_args = { cores.size(), 
std::vector(1) }; std::vector< std::vector > unary_compute_args = { cores.size(), std::vector(5) }; std::vector< std::vector > unary_writer_args = { cores.size(), std::vector(1) }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp index 2d9c90c2eb8..22826b7cd4d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp @@ -135,11 +135,11 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column( /** compute */ - vector compute_args = { + std::vector compute_args = { (uint32_t)nblocks_per_core, // per_core_block_cnt (uint32_t)ntiles_per_block, // per_block_ntiles }; - vector compute_args_cliff = { + std::vector compute_args_cliff = { (uint32_t)nblocks_per_core_cliff , (uint32_t)ntiles_per_block, // per_block_ntiles }; @@ -384,7 +384,7 @@ operation::ProgramWithCallbacks untilize_multi_core( tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); } else { bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - vector reader_ct_args = {(uint32_t)src0_is_dram}; + std::vector reader_ct_args = {(uint32_t)src0_is_dram}; unary_reader_kernel_id = CreateKernel( program, @@ -406,7 +406,7 @@ operation::ProgramWithCallbacks untilize_multi_core( } else { bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; if (src_block_sharded) { - vector writer_ct_args = { + std::vector writer_ct_args = { (uint32_t)out_is_dram, (uint32_t)(input_cb_data_format == tt::DataFormat::Float32)}; unary_writer_kernel_id = CreateKernel( program, @@ -416,7 +416,7 @@ operation::ProgramWithCallbacks untilize_multi_core( } else { bool stick_size_is_power_of_two = is_power_of_two_at_least_32(block_size_nbytes); uint32_t log2_stick_size = stick_size_is_power_of_two ? 
(std::uint32_t)std::log2(block_size_nbytes) : 0; - vector writer_ct_args = { + std::vector writer_ct_args = { (uint32_t)out_is_dram, (uint32_t)stick_size_is_power_of_two, (uint32_t)log2_stick_size, @@ -433,11 +433,11 @@ operation::ProgramWithCallbacks untilize_multi_core( /** compute */ - vector compute_args = { + std::vector compute_args = { (uint32_t)nblocks_per_core, // per_core_block_cnt (uint32_t)ntiles_per_block, // per_block_ntiles }; - vector compute_args_cliff = { + std::vector compute_args_cliff = { (uint32_t)nblocks_per_core_cliff, (uint32_t)ntiles_per_block, // per_block_ntiles }; @@ -482,7 +482,7 @@ operation::ProgramWithCallbacks untilize_multi_core( continue; } // reader runtime args - vector reader_rt_args; + std::vector reader_rt_args; if (src_sharded) { reader_rt_args = { @@ -499,7 +499,7 @@ operation::ProgramWithCallbacks untilize_multi_core( // ntiles_per_block * nblocks_per_core); // writer runtime args - vector writer_rt_args; + std::vector writer_rt_args; if (out_sharded) { writer_rt_args = { ntiles_per_block * nblocks_per_core // ntiles @@ -570,7 +570,7 @@ operation::ProgramWithCallbacks untilize_multi_core( CoreCoord core = row_major ? 
CoreCoord{ncores_full % ncores_x, ncores_full / ncores_x} : CoreCoord{ncores_full / ncores_y, ncores_full % ncores_y}; // reader runtime args - vector reader_rt_args; + std::vector reader_rt_args; if (src_sharded) { reader_rt_args = { @@ -587,7 +587,7 @@ operation::ProgramWithCallbacks untilize_multi_core( // nblocks_per_core_cliff); // writer runtime args - vector writer_rt_args; + std::vector writer_rt_args; if (out_sharded) { writer_rt_args = { ntiles_per_block * nblocks_per_core_cliff // ntiles @@ -787,7 +787,7 @@ operation::ProgramWithCallbacks untilize_single_core( core, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = { + std::vector compute_args = { uint32_t(num_tiles / num_tiles_per_block), // per_core_block_cnt uint32_t(num_tiles_per_block) // per_core_block_tile_cnt }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp index 20c55da5bd3..7699c7a3403 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp @@ -49,7 +49,7 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( auto output_shard_shape = output_tensor.shard_spec().value().shape; TT_ASSERT(input_shard_shape[1] == output_shard_shape[1]); uint32_t input_nhw_height = input_shape[0] * input_shape[1] * input_shape[2]; - uint32_t remapped_input_shard_shape_for_output_grid = input_nhw_height / ncores_nhw; + uint32_t remapped_input_shard_shape_for_output_grid = tt::div_up(input_nhw_height, ncores_nhw); uint32_t ntiles_per_block = tt::div_up(input_shard_shape[1], TILE_WIDTH); uint32_t input_nblocks_per_core = tt::div_up(remapped_input_shard_shape_for_output_grid, TILE_HEIGHT); uint32_t 
input_npages = ntiles_per_block * input_nblocks_per_core; diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp index 432aa4f43d9..10c8ce9dc22 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp @@ -157,7 +157,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_single_core( core, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; + std::vector compute_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; std::string compute_kernel( "ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp"); @@ -314,7 +314,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved( const std::vector& assignment = core_assignments.at(i); // writer runtime args - vector writer_rt_args = { + std::vector writer_rt_args = { dst_buffer->address(), unpadded_row_size_bytes, padded_row_size_bytes, @@ -475,7 +475,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( */ KernelHandle unary_writer_kernel_id; if (out_sharded) { - vector writer_ct_args = {(uint32_t)output_cb_index, (uint32_t)sharded_output_cb_index}; + std::vector writer_ct_args = {(uint32_t)output_cb_index, (uint32_t)sharded_output_cb_index}; unary_writer_kernel_id = CreateKernel( program, unpad_tensor_w_16 ? 
"ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/kernels/dataflow/" @@ -486,7 +486,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( WriterDataMovementConfig(writer_ct_args)); } else { bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - vector writer_ct_args = { + std::vector writer_ct_args = { (uint32_t)out_is_dram, (uint32_t)(input_cb_data_format == tt::DataFormat::Float32)}; unary_writer_kernel_id = CreateKernel( program, @@ -497,7 +497,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( /** compute */ - vector compute_args = { + std::vector compute_args = { (uint32_t)nblocks_per_core, // per_core_block_cnt (uint32_t)ntiles_per_block, // per_block_ntiles }; @@ -529,7 +529,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( std::vector cores; if (out_sharded) { - vector writer_rt_args; + std::vector writer_rt_args; if (unpad_tensor_w_16) { writer_rt_args = {num_output_rows_unpadded, num_input_tiles}; } else { diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp index 49980f2dc42..af6ee7605c6 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp @@ -110,13 +110,18 @@ auto preprocess_inputs( auto repeat_smaller = [](const auto &first, auto &second) { const auto first_shape = first.get_shape(); const auto second_shape = second.get_shape(); - // repeats second if it is smaller if (first_shape.rank() == 4 and second_shape.rank() == 4 and first_shape[0] > second_shape[0]) { - tt::log_warning(tt::LogOp, "Using repeat op to broadcast batch dim"); + TT_FATAL(second_shape[0] == 1, "Dimension trying to broadcast is not equal to 1"); Shape repeats(std::array{first_shape[0], 1, 1, 1}); second = ttnn::repeat(second, repeats); } + // repeats second if it is smaller + if (first_shape.rank() == 4 and 
second_shape.rank() == 4 and first_shape[1] > second_shape[1]) { + TT_FATAL(second_shape[1] == 1, "Dimension trying to broadcast is not equal to 1"); + Shape repeats(std::array{1, first_shape[1], 1, 1}); + second = ttnn::repeat(second, repeats); + } }; repeat_smaller(input_tensor_a, input_tensor_b); repeat_smaller(input_tensor_b, input_tensor_a); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index 1303233ab57..96936013a3b 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -280,7 +280,7 @@ Tensor ExecuteBinaryRemainder::invoke(const Tensor& input, float scalar, const s // Binary FMOD will be overloaded by unary FMOD in another PR Tensor ExecuteBinaryFmod::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { auto arch = input_a.device()->arch(); - TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE, "Op is only supported on Wormhole or Blackhole"); DataType input_dtype = input_a.get_dtype(); Tensor a = typecast(input_a, DataType::FLOAT32); Tensor b = typecast(input_b, DataType::FLOAT32); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp index eb57eb345b9..a6f6f3f8650 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp @@ -77,7 +77,7 @@ inline __attribute__((always_inline)) void set_eltwise_binary_runtime_args( uint32_t block_height = 0, block_width = 0, block_size = 0, output_width = 0, last_unpadded_block_height = 0, 
last_unpadded_block_width = 0; CoreCoord end_core; - vector cores; + std::vector cores; if (shard_spec.has_value()) { all_cores = shard_spec.value().grid; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index dee368d713c..1ba9dbd7ac5 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ -817,7 +817,7 @@ Tensor _normalize_global(const Tensor& y, const std::optional& ou Tensor _frac(const Tensor& input, const std::optional& output_mem_config) { auto arch = input.device()->arch(); - TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE, "Op is only supported on Wormhole or Blackhole"); Tensor trunc_res = ttnn::trunc(input); Tensor result = ttnn::subtract(input, trunc_res, std::nullopt, output_mem_config); return result; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp index bca50f4d410..228bef05915 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp @@ -80,12 +80,12 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create( all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { num_tiles_per_core_group_1, // per_core_block_cnt 1 // per_core_block_size }; - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (args.preserve_fp32_precision) { unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; } @@ -106,7 
+106,7 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create( .defines = unary_defines}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_tiles_per_core_group_2, // per_core_block_cnt 1 // per_core_block_size }; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp index 2920fb011c2..e9d4c2d84e9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp @@ -109,12 +109,12 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( all_cores, tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args, kernel_defines)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { 1, // per_core_block_cnt num_tile_per_core // per_core_block_size }; - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (args.preserve_fp32_precision) { unpack_to_dest_mode[in_cb_id] = UnpackToDestMode::UnpackToDestFp32; } diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp index 38b8a222c48..f35d2793889 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp @@ -162,7 +162,7 @@ operation::ProgramWithCallbacks embeddings_tilized( embedding_defines)); if (num_blocks_per_core_group_1 > 0) { - vector compute_args_1 = { + std::vector compute_args_1 = { uint32_t(num_blocks_per_core_group_1), // per_core_block_cnt uint32_t(num_tiles_per_block) // per_core_block_tile_cnt }; @@ -174,7 +174,7 
@@ operation::ProgramWithCallbacks embeddings_tilized( } if (num_blocks_per_core_group_2 > 0) { - vector compute_args_2 = { + std::vector compute_args_2 = { uint32_t(num_blocks_per_core_group_2), // per_core_block_cnt uint32_t(num_tiles_per_block) // per_core_block_tile_cnt }; diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp index 56f1d815fc4..451d6db66d9 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp @@ -68,7 +68,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { num_tiles_per_core_group_1, // per_core_block_cnt 1 // per_core_block_size }; @@ -84,7 +84,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult .compile_args = compute_kernel_args_group_1}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_tiles_per_core_group_2, // per_core_block_cnt 1 // per_core_block_size }; diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp index 71e3acfaa72..34dea89e96e 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp @@ -68,7 +68,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args_group_1 = { + std::vector 
compute_kernel_args_group_1 = { num_tiles_per_core_group_1, // per_core_block_cnt 1 // per_core_block_size }; @@ -84,7 +84,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin .compile_args = compute_kernel_args_group_1}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_tiles_per_core_group_2, // per_core_block_cnt 1 // per_core_block_size }; diff --git a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp index 204e66cb70b..b16a181b43a 100644 --- a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp @@ -73,7 +73,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { num_tiles_per_core_group_1, // per_core_block_cnt 1 // per_core_block_size }; @@ -89,7 +89,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip .compile_args = compute_kernel_args_group_1}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_tiles_per_core_group_2, // per_core_block_cnt 1 // per_core_block_size }; diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp index a24a8f45fed..2088ec7bb5c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp @@ -149,7 +149,7 @@ operation::ProgramWithCallbacks multi_core_attn_matmul(const Tensor &a, const Te all_device_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = { + std::vector compute_args = { (uint32_t) transpose_hw_bool, // transpose_hw for matmul_init }; // bmm compute kernel the B, Mt, Nt are just 3 for loops that technically act as 1 large loop, so only set Nt for simplicity diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp index 011e9c11e97..8849d331c83 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp @@ -224,7 +224,7 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co } ); - vector compute_args = { + std::vector compute_args = { (uint32_t) transpose_hw_bool, // transpose_hw for matmul_init out_subblock_w, out_subblock_num_tiles, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp index 7db3b1a9e78..c4a346f3e61 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp @@ -24,7 +24,7 @@ std::vector, std::vector>> get_unpad_r auto input_buffer = input_tensor.buffer(); auto input_shape = 
input_tensor.get_legacy_shape(); - vector common_reader_kernel_args = {input_buffer->address(), 0}; + std::vector common_reader_kernel_args = {input_buffer->address(), 0}; std::vector, std::vector>> ret_val(num_cores_total); @@ -35,9 +35,9 @@ std::vector, std::vector>> get_unpad_r CoreCoord core = {i % num_cores_x, i / num_cores_x}; // reader and writer kernel args - vector reader_kernel_args = common_reader_kernel_args; + std::vector reader_kernel_args = common_reader_kernel_args; reader_kernel_args[1] = start_id; - vector writer_kernel_args = { + std::vector writer_kernel_args = { num_tiles_per_core, }; ret_val[i] = {reader_kernel_args, writer_kernel_args}; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp index f9ed0492f56..e1b2660ef51 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp @@ -286,7 +286,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( all_cores, tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_kernel_defines)); - vector compute_kernel_args = { + std::vector compute_kernel_args = { (std::uint32_t)input_cb_index, (std::uint32_t)rotated_input_cb_index, (std::uint32_t)cos_cb_index, diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp new file mode 100644 index 00000000000..6b8ff0ba570 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +#include "index_fill_device_operation.hpp" + +#include "ttnn/tensor/tensor.hpp" + +namespace ttnn::operations::index_fill { +IndexFillOperation::program_factory_t IndexFillOperation::select_program_factory( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + return MultiCore{}; +} + +void IndexFillOperation::validate( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + const auto& input = tensor_args.input; + const auto& index = tensor_args.index; + const uint32_t dim = operation_attributes.dim; + TT_FATAL(input.storage_type() == StorageType::DEVICE, "Index fill: Input must be on device"); + TT_FATAL(input.buffer() != nullptr, "Index fill: Input must be allocated in buffer on device"); + TT_FATAL( + input.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED, + "Index fill: Not currently supporting sharding"); + TT_FATAL( + operation_attributes.memory_config.memory_layout == TensorMemoryLayout::INTERLEAVED, + "Index fill: Not currently supporting sharding"); + TT_FATAL(index.get_logical_shape().rank() == 1, + "Index fill: Index tensor must be 1D!"); + TT_FATAL( + dim < input.get_logical_shape().rank() && dim >= 0, + "Index fill: Invalid dimension"); + TT_FATAL(index.get_logical_shape().rank() == 1, + "Index fill: Index tensor must be 1D!"); +} +void IndexFillOperation::validate_on_program_cache_miss( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate(operation_attributes, tensor_args); +} +void IndexFillOperation::validate_on_program_cache_hit( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate(operation_attributes, tensor_args); +} +IndexFillOperation::shape_return_value_t IndexFillOperation::compute_output_shapes( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + return 
tensor_args.input.get_logical_shape(); +} +IndexFillOperation::tensor_return_value_t IndexFillOperation::create_output_tensors( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + const auto output_shape = compute_output_shapes(operation_attributes, tensor_args); + const auto& input = tensor_args.input; + return create_device_tensor( + output_shape, + input.tensor_attributes->dtype, + input.tensor_attributes->layout, + input.device(), + operation_attributes.memory_config); +} +std::tuple IndexFillOperation::invoke( + const Tensor& input, + const uint32_t dim, + const Tensor& index, + const std::variant value, + const std::optional& memory_config) { + return { + operation_attributes_t{dim, value, memory_config.value_or(input.memory_config())}, tensor_args_t{input, index}}; +} +} // namespace ttnn::operations::index_fill diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp new file mode 100644 index 00000000000..7ed3cb413c3 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +#include +#include + +#include "ttnn/decorators.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/types.hpp" +namespace ttnn::operations::index_fill { +struct IndexFillOperation { + struct operation_attributes_t { + const uint32_t dim; + const std::variant value; + const MemoryConfig memory_config; + }; + struct tensor_args_t { + const Tensor& input; + const Tensor& index; + }; + using shape_return_value_t = SimpleShape; + using tensor_return_value_t = Tensor; + struct MultiCore { + struct shared_variables_t { + KernelHandle reader_kernel_id; + KernelHandle writer_kernel_id; + std::size_t num_cores; + std::size_t num_cores_y; + }; + using cached_program_t = ttnn::device_operation::CachedProgram; + static cached_program_t create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output); + static void override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output); + }; + using program_factory_t = std::variant; + static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&); + static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&); + static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&); + static void validate(const operation_attributes_t&, const tensor_args_t&); + static shape_return_value_t compute_output_shapes(const operation_attributes_t&, const tensor_args_t&); + static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const tensor_args_t&); + static std::tuple invoke( + const Tensor& input, + const uint32_t dim, + const Tensor& index, + const std::variant value, + const std::optional& memory_config); +}; +} // namespace ttnn::operations::index_fill +namespace ttnn::prim { 
+constexpr auto index_fill = + ttnn::register_operation<"ttnn::prim::index_fill", ttnn::operations::index_fill::IndexFillOperation>(); +} diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp new file mode 100644 index 00000000000..7327d13178f --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "host_api.hpp" +#include "impl/buffers/circular_buffer_types.hpp" +#include "index_fill_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" +#include "tt_metal/host_api.hpp" +#include "ttnn/tensor/types.hpp" + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::constants; +using namespace tt::tt_metal::detail; + +union datatype { + uint32_t u32; + float f32; +} u_fill_value; + +namespace ttnn::operations::index_fill { +IndexFillOperation::MultiCore::cached_program_t IndexFillOperation::MultiCore::create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output) { + const Tensor& index = tensor_args.index; + const Tensor& input = tensor_args.input; + uint32_t dim = operation_attributes.dim; + + auto dtype = input.get_dtype(); + + const auto input_shape = input.get_logical_shape(); + const auto n = input_shape.rank(); + + uint32_t num_rows_to_fill_per_index = 1; + for (int i = n - 2; i > dim; i--) { + num_rows_to_fill_per_index *= input_shape[i]; + } + + auto fill_value = operation_attributes.value; + if (std::holds_alternative(fill_value)) { + u_fill_value.u32 = std::get(fill_value); + } else if (std::holds_alternative(fill_value)) { + u_fill_value.f32 = std::get(fill_value); + } + + auto num_rows = input.volume() / input.get_logical_shape()[-1]; + Program program{}; + Device* device = 
input.device(); + + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + + auto [num_cores, all_cores, core_group_1, core_group_2, num_rows_per_core_group_1, num_rows_per_core_group_2] = + tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_rows); + + auto input_data_format = datatype_to_dataformat_converter(dtype); + auto index_data_format = datatype_to_dataformat_converter(index.get_dtype()); + auto output_data_format = datatype_to_dataformat_converter(output.get_dtype()); + + uint32_t input_unit_size = input.get_logical_shape()[-1] * input.element_size(); + uint32_t rounded_input_unit_size = round_up_to_mul32(input_unit_size); + + uint32_t index_unit_size = index.volume() * index.element_size(); + uint32_t rounded_index_unit_size = round_up_to_mul32(index_unit_size); + + uint32_t output_unit_size = output.get_logical_shape()[-1] * output.element_size(); + uint32_t rounded_output_unit_size = round_up_to_mul32(output_unit_size); + + auto src_cb_index = CB::c_in0; + CircularBufferConfig cb_src_config = + CircularBufferConfig(rounded_input_unit_size, {{src_cb_index, input_data_format}}) + .set_page_size(src_cb_index, rounded_input_unit_size); + auto cb_src = CreateCircularBuffer(program, all_cores, cb_src_config); + std::map reader_defines; + + switch (dtype) { + case DataType::BFLOAT16: reader_defines["OUTPUT_DTYPE_BFLOAT16"] = "1"; break; + case DataType::INT32: reader_defines["OUTPUT_DTYPE_INT32"] = "1"; break; + case DataType::FLOAT32: reader_defines["OUTPUT_DTYPE_FLOAT32"] = "1"; break; + default: + TT_FATAL(false, "Unsupported datatype"); + break; + } + + auto index_cb_index = CB::c_in1; + CircularBufferConfig cb_index_config = + CircularBufferConfig(rounded_index_unit_size, {{index_cb_index, index_data_format}}) + .set_page_size(index_cb_index, rounded_index_unit_size); + auto cb_index = 
CreateCircularBuffer(program, all_cores, cb_index_config); + + auto dst_cb_index = CB::c_out0; + CircularBufferConfig dst_cb_config = + CircularBufferConfig(rounded_output_unit_size, {{dst_cb_index, output_data_format}}) + .set_page_size(dst_cb_index, rounded_output_unit_size); + auto cb_dst = CreateCircularBuffer(program, all_cores, dst_cb_config); + + bool in_is_dram = input.buffer()->is_dram(); + bool index_is_dram = index.buffer()->is_dram(); + bool out_is_dram = output.buffer()->is_dram(); + + // Create Kernels + // reader + std::vector reader_compile_time_args = { + (std::uint32_t)in_is_dram, + (std::uint32_t)index_is_dram, + (std::uint32_t)src_cb_index, + (std::uint32_t)index_cb_index, + (std::uint32_t)(dim == n - 1), + (std::uint32_t)index.volume()}; + + auto reader_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp", + all_cores, + ReaderDataMovementConfig(reader_compile_time_args)); + + std::vector writer_compile_time_args = {(std::uint32_t)out_is_dram}; + + auto writer_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp", + all_cores, + WriterDataMovementConfig(writer_compile_time_args)); + + uint32_t unit_offset = 0; + uint32_t num_cores_group_1 = core_group_1.num_cores(); + auto cores = grid_to_cores(num_cores, num_cores_x, num_cores_y); + for (uint32_t i = 0; i < cores.size(); i++) { + const auto& core = cores[i]; + uint32_t num_rows_per_core = i < num_cores_group_1 ? 
num_rows_per_core_group_1 : num_rows_per_core_group_2; + if (core_group_1.core_coord_in_core_ranges(core)) { + num_rows_per_core = num_rows_per_core_group_1; + } else if (core_group_2.core_coord_in_core_ranges(core)) { + num_rows_per_core = num_rows_per_core_group_2; + } else { + TT_FATAL(false, "Core not in specified core ranges"); + } + SetRuntimeArgs( + program, + reader_kernel_id, + core, + {input.buffer()->address(), + index.buffer()->address(), + u_fill_value.u32, + input_unit_size, + index_unit_size, + unit_offset, + num_rows_per_core, + num_rows_to_fill_per_index, + input_shape[dim]}); + SetRuntimeArgs( + program, + writer_kernel_id, + core, + {output.buffer()->address(), num_rows_per_core, unit_offset, output_unit_size}); + + unit_offset += num_rows_per_core; + } + + return {std::move(program), {reader_kernel_id, writer_kernel_id, num_cores, num_cores_y}}; +} + +void IndexFillOperation::MultiCore::override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& output) { + auto& program = cached_program.program; + auto& reader_kernel_id = cached_program.shared_variables.reader_kernel_id; + auto& writer_kernel_id = cached_program.shared_variables.writer_kernel_id; + auto& num_cores = cached_program.shared_variables.num_cores; + auto& num_cores_y = cached_program.shared_variables.num_cores_y; + + auto src_buffer = tensor_args.input.buffer()->address(); + auto index_buffer = tensor_args.index.buffer()->address(); + auto output_buffer = output.buffer()->address(); + + for (uint32_t i = 0; i < num_cores; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + { + auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer; + runtime_args[1] = index_buffer; + } + + { + auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = output_buffer; + } + } +} + +} // namespace 
ttnn::operations::index_fill diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp new file mode 100644 index 00000000000..1da1cba100a --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/moreh_common.hpp" +typedef union { + float f; + uint32_t u; +} value; + +bool is_in_indices(uint32_t *index_ptr, uint32_t size, uint32_t row_id) { + for (uint32_t i = 0; i < size; i++) { + if (row_id == index_ptr[i]) { + return true; + } + } + return false; +} + +void kernel_main() { + uint32_t input_addr = get_arg_val(0); + uint32_t index_addr = get_arg_val(1); + uint32_t fill_value = get_arg_val(2); + uint32_t input_page_size = get_arg_val(3); + uint32_t index_page_size = get_arg_val(4); + uint32_t start_row_id = get_arg_val(5); + uint32_t num_rows_per_core = get_arg_val(6); + uint32_t num_rows_to_fill_per_index = get_arg_val(7); + uint32_t dim = get_arg_val(8); + + constexpr bool input_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool index_is_dram = get_compile_time_arg_val(1) == 1; + constexpr uint32_t src_cb_id = get_compile_time_arg_val(2); + constexpr uint32_t index_cb_id = get_compile_time_arg_val(3); + constexpr bool is_last_dim = get_compile_time_arg_val(4) == 1; + constexpr uint32_t index_size = get_compile_time_arg_val(5); + + constexpr uint32_t onetile = 1; + + const InterleavedAddrGen s0 = {.bank_base_address = input_addr, .page_size = input_page_size}; + + const InterleavedAddrGen s1 = {.bank_base_address = index_addr, .page_size = index_page_size}; + + value val; + val.u = fill_value; + + cb_reserve_back(index_cb_id, onetile); + + uint32_t index_cb_reader = get_write_ptr(index_cb_id); + uint64_t 
index_noc_addr = get_noc_addr(0, s1); + noc_async_read(index_noc_addr, index_cb_reader, index_page_size); + noc_async_read_barrier(); + uint32_t *index_ptr = reinterpret_cast(index_cb_reader); + if (is_last_dim) { + for (uint32_t row_id = start_row_id; row_id < start_row_id + num_rows_per_core; row_id++) { + cb_reserve_back(src_cb_id, onetile); + uint32_t src_cb_reader = get_write_ptr(src_cb_id); + uint64_t input_noc_addr = get_noc_addr(row_id, s0); + noc_async_read(input_noc_addr, src_cb_reader, input_page_size); + noc_async_read_barrier(); + + uint32_t *input_ptr = reinterpret_cast(src_cb_reader); + + for (uint32_t i = 0; i < index_size; i++) { + uint32_t current_index = index_ptr[i]; + input_ptr[current_index] = fill_value; + } + + cb_push_back(src_cb_id, onetile); + } + } else { + for (uint32_t row_id = start_row_id; row_id < start_row_id + num_rows_per_core; row_id++) { + cb_reserve_back(src_cb_id, onetile); + uint32_t src_cb_reader = get_write_ptr(src_cb_id); + uint64_t input_noc_addr = get_noc_addr(row_id, s0); + noc_async_read(input_noc_addr, src_cb_reader, input_page_size); + noc_async_read_barrier(); + + if (is_in_indices(index_ptr, index_size, row_id / num_rows_to_fill_per_index % dim)) { +#ifdef OUTPUT_DTYPE_BFLOAT16 + auto ptr = reinterpret_cast(write_addr); + for (uint32_t i = 0; i < index_size; ++i) { + ptr[i] = val.u >> 16; + } +#endif +#ifdef OUTPUT_DTYPE_INT32 + auto ptr = reinterpret_cast(write_addr); + for (uint32_t i = 0; i < index_size; ++i) { + ptr[i] = fill_value; + } +#endif +#ifdef OUTPUT_DTYPE_FLOAT32 + auto ptr = reinterpret_cast(write_addr); + for (uint32_t i = 0; i < index_size; ++i) { + ptr[i] = val.f; + } +#endif + } + cb_push_back(src_cb_id, onetile); + } + } + cb_push_back(index_cb_id, onetile); +} diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp new file mode 100644 index 00000000000..3ecfca0c0a7 --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t output_buffer_address = get_arg_val(0); + uint32_t num_rows_per_core = get_arg_val(1); + uint32_t start_id = get_arg_val(2); + uint32_t output_unit_size = get_arg_val(3); + + constexpr uint32_t dst_cb_id = tt::CB::c_out0; + constexpr uint32_t src_cb_id = tt::CB::c_in0; + constexpr bool output_is_dram = get_compile_time_arg_val(0) == 1; + + constexpr uint32_t onetile = 1; + + const InterleavedAddrGen s = { + .bank_base_address = output_buffer_address, + .page_size = output_unit_size, + }; + for (uint32_t i = start_id; i < start_id + num_rows_per_core; i++) { + cb_wait_front(src_cb_id, onetile); + + uint32_t writer_ptr = get_read_ptr(src_cb_id); + uint64_t output_noc_addr = get_noc_addr(i, s); + noc_async_write(writer_ptr, output_noc_addr, output_unit_size); + noc_async_write_barrier(); + + cb_pop_front(src_cb_id, onetile); + } +} diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp new file mode 100644 index 00000000000..513d7f42190 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "index_fill.hpp" + +#include "ttnn/decorators.hpp" +#include "ttnn/operations/index_fill/device/index_fill_device_operation.hpp" + +namespace ttnn::operations::index_fill { + +Tensor IndexFill::invoke( + const Tensor &input, + const uint32_t dim, + const Tensor &index, + const std::variant value, + const std::optional &memory_config) { + return ttnn::prim::index_fill(input, dim, index, value, memory_config); +} + +} // namespace ttnn::operations::index_fill diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp new file mode 100644 index 00000000000..9f0393873b5 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ttnn/decorators.hpp" + +namespace ttnn::operations::index_fill { + +struct IndexFill { + static Tensor invoke( + const Tensor &input, + const uint32_t dim, + const Tensor &index, + const std::variant value, + const std::optional &memory_config); +}; +} // namespace ttnn::operations::index_fill + +namespace ttnn { +constexpr auto index_fill = + ttnn::register_operation_with_auto_launch_op<"ttnn::index_fill", ttnn::operations::index_fill::IndexFill>(); +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp new file mode 100644 index 00000000000..64275bbf3a5 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "index_fill_pybind.hpp" + +#include +#include + +#include "index_fill.hpp" +#include "ttnn/cpp/pybind11/decorators.hpp" +#include "ttnn/operations/index_fill/device/index_fill_device_operation.hpp" + +namespace py = pybind11; + +namespace ttnn::operations::index_fill { + +void bind_index_fill_operation(py::module& module) { + auto doc = + R"doc(index_fill(input: Tensor, dim: uint32, index: Tensor, value: int or float, memory_config: MemoryConfig) -> Tensor + Create or fill a tensor with the given value, with the specified `memory_config`. + This operation only supports ROW_MAJOR_LAYOUT for now. + Args: + * :attr:`input`: The tensor that we will operate on + * :attr:`dim`: The dimension that we need to fill the value along. + * :attr:`index`: The index that we need to fill the value in. + * :attr:`value`: The value which will be used to fill the output tensor + * :attr:`memory_config`: The memory configuration for the output tensor. + )doc"; + + bind_registered_operation( + module, + ttnn::index_fill, + doc, + ttnn::pybind_arguments_t{ + py::arg("input"), + py::arg("dim"), + py::arg("index"), + py::arg("value"), + py::kw_only(), + py::arg("memory_config") = std::nullopt}); +} + +} // namespace ttnn::operations::index_fill diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp new file mode 100644 index 00000000000..49e664decdc --- /dev/null +++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + +namespace py = pybind11; + +namespace ttnn::operations::index_fill { +void bind_index_fill_operation(py::module& module); +} diff --git a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp index 4e7719e53ef..11a8a1b9bb6 100644 --- a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp @@ -169,7 +169,7 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tens all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args = { + std::vector compute_kernel_args = { src0_cb_index, src1_cb_index, interm0_cb_index, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp index 09b633211ea..fca73b736cd 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp @@ -109,7 +109,7 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor all_cores, tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args_group_1 = { + std::vector compute_args_group_1 = { 1, // B 1, // Mt Kt, // Kt @@ -127,7 +127,7 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor .compile_args = compute_args_group_1}); if (!core_group_2.ranges().empty()) { - vector compute_args_group_2 = { + std::vector compute_args_group_2 = { 1, // B 1, // Mt Kt, // Kt diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index a292a31be18..af4003a2b24 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -469,7 +469,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w; - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles @@ -1213,7 +1213,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w; - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp index 330dce8c720..8fece73c387 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp @@ -626,7 +626,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w; - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp 
index 96d914fbff2..a79f70be4bc 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -765,7 +765,7 @@ operation::ProgramWithCallbacks create_program_dram_sharded( uint32_t in1_per_core_w = per_core_N_unpad; uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w; - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp index 4a6fd50be09..f7424c5e55a 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp @@ -192,7 +192,7 @@ operation::ProgramWithCallbacks create_program( reader_writer_compile_time_args, mm_kernel_in1_reader_writer_defines)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles @@ -234,7 +234,7 @@ operation::ProgramWithCallbacks create_program( .compile_args = compute_kernel_args_group_1, .defines = mm_kernel_defines}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp index 
482e6c8bca9..2fe8f327b72 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp @@ -62,7 +62,7 @@ tt_metal::operation::ProgramWithCallbacks create_program( uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w; - vector compute_kernel_args = { + std::vector compute_kernel_args = { in0_block_w, // in0_block_w in0_num_subblocks, // in0_num_subblocks in0_block_num_tiles, // in0_block_num_tiles diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp index cf0faa72b4d..3cc32ff7ed1 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp @@ -160,25 +160,10 @@ std::tuple tt::stl::hash::hash_t { - return operation::hash_operation( - operation_attributes.beta1, - operation_attributes.beta2, - operation_attributes.eps, - operation_attributes.amsgrad, - operation_attributes.weight_decay, - operation_attributes.memory_config, - operation_attributes.compute_kernel_config, - tensor_args.param_in.memory_config(), - tensor_args.param_in.dtype(), - tensor_args.grad.memory_config(), - tensor_args.grad.dtype(), - tensor_args.exp_avg_in.memory_config(), - tensor_args.exp_avg_in.dtype(), - tensor_args.exp_avg_sq_in.memory_config(), - tensor_args.exp_avg_sq_in.dtype(), - tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().memory_config() - : MemoryConfig{}, - tensor_args.max_exp_avg_sq_in.has_value() ? 
tensor_args.max_exp_avg_sq_in.value().dtype() : DataType::INVALID); + auto operation_attributes_without_step_and_lr = operation_attributes; + operation_attributes_without_step_and_lr.step = 0; + operation_attributes_without_step_and_lr.lr = 0.0f; + return tt::stl::hash::hash_objects_with_default_seed(operation_attributes_without_step_and_lr, tensor_args); } } // namespace ttnn::operations::moreh::moreh_adam diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp index 774d5d63885..1084b7de99e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp @@ -152,24 +152,9 @@ MorehAdamWDeviceOperation::invoke( tt::stl::hash::hash_t MorehAdamWDeviceOperation::compute_program_hash( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { - return operation::hash_operation( - operation_attributes.beta1, - operation_attributes.beta2, - operation_attributes.eps, - operation_attributes.amsgrad, - operation_attributes.weight_decay, - operation_attributes.memory_config, - operation_attributes.compute_kernel_config, - tensor_args.param_in.memory_config(), - tensor_args.param_in.dtype(), - tensor_args.grad.memory_config(), - tensor_args.grad.dtype(), - tensor_args.exp_avg_in.memory_config(), - tensor_args.exp_avg_in.dtype(), - tensor_args.exp_avg_sq_in.memory_config(), - tensor_args.exp_avg_sq_in.dtype(), - tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().memory_config() - : MemoryConfig{}, - tensor_args.max_exp_avg_sq_in.has_value() ? 
tensor_args.max_exp_avg_sq_in.value().dtype() : DataType::INVALID); + auto operation_attributes_without_step_and_lr = operation_attributes; + operation_attributes_without_step_and_lr.step = 0; + operation_attributes_without_step_and_lr.lr = 0.0f; + return tt::stl::hash::hash_objects_with_default_seed(operation_attributes_without_step_and_lr, tensor_args); } } // namespace ttnn::operations::moreh::moreh_adamw diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp index 447de9add9c..cad1e01e515 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp @@ -81,7 +81,7 @@ MorehDotOperation::SingleCore::cached_program_t MorehDotOperation::SingleCore::c const auto writer_kernel_id = tt::operations::primary::CreateWriteKernel(program, writer_kernel_file, core, writer_compile_time_args); - vector compute_kernel_args = {}; + std::vector compute_kernel_args = {}; std::map compute_defines; compute_defines["REDUCE_OP"] = "PoolType::SUM"; compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_ROW"; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp index bdbeead8284..7bcd39d21bf 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp @@ -105,7 +105,7 @@ MorehDotBackwardOperation::SingleCore::cached_program_t MorehDotBackwardOperatio const auto writer_kernel_id = tt::operations::primary::CreateWriteKernel(program, writer_kernel_file, core, writer_compile_time_args); - vector compute_kernel_args = {}; + std::vector compute_kernel_args = {}; std::map 
compute_defines; const auto compute_kernel_file = diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp index a6c5bddad33..3cd4ccff68d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp @@ -159,7 +159,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset}; uint32_t num_units_per_core = i < g1_numcores ? num_units_per_core_group_1 : num_units_per_core_group_2; - vector reader_args = { + std::vector reader_args = { // buffers input_5d.buffer()->address(), index_info[0].address, @@ -208,7 +208,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera input_unit_size, }; - vector writer_args = { + std::vector writer_args = { // buffer output.buffer()->address(), diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp index 90b4e864bfe..480c6010841 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp @@ -225,7 +225,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset}; uint32_t num_units_per_core = i < g1_numcores ? 
num_units_per_core_group_1 : num_units_per_core_group_2; - vector reader_args = { + std::vector reader_args = { // buffers input.buffer()->address(), index_info[0].address, @@ -284,7 +284,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( num_alignment_width, }; - vector writer_args = { + std::vector writer_args = { // buffers output.buffer()->address(), @@ -452,7 +452,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset}; uint32_t num_units_per_core = i < g1_numcores ? num_units_per_core_group_1 : num_units_per_core_group_2; - vector reader_args = { + std::vector reader_args = { // buffers input.buffer()->address(), index_info[0].address, @@ -509,7 +509,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( input_unit_size, input.element_size(), }; - vector writer_args = { + std::vector writer_args = { // buffers output.buffer()->address(), diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp index b331eca682b..68964ac8820 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp @@ -129,7 +129,7 @@ std::tuple unpack_to_dest_mode) { + std::vector unpack_to_dest_mode) { std::vector compute_kernel_ids{}; KernelHandle compute_kernel_id{}; for (auto arg : args) { @@ -155,7 +155,7 @@ std::tuple unpack_to_dest_mode) { + std::vector unpack_to_dest_mode) { KernelHandle compute_kernel_id{0}; if (arg.num_tile_per_core_group > 0) { compute_kernel_id = CreateKernel( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp index 384b9097f4b..7cc17ae86af 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp @@ -86,7 +86,7 @@ struct ComputeKernelArg { struct 
ComputeKernelConfig { MathFidelity math_fidelity = MathFidelity::HiFi4; bool fp32_dest_acc_en = false; - vector unpack_to_dest_mode; + std::vector unpack_to_dest_mode; bool math_approx_mode = false; std::map defines; }; @@ -99,7 +99,7 @@ struct ComputeKernelConfig { MathFidelity math_fidelity = MathFidelity::HiFi4, bool fp32_dest_acc_en = false, bool math_approx_mode = false, - vector unpack_to_dest_mode = {}); + std::vector unpack_to_dest_mode = {}); [[maybe_unused]] KernelHandle CreateComputeKernel( Program &program, @@ -109,7 +109,7 @@ struct ComputeKernelConfig { MathFidelity math_fidelity = MathFidelity::HiFi4, bool fp32_dest_acc_en = false, bool math_approx_mode = false, - vector unpack_to_dest_mode = {}); + std::vector unpack_to_dest_mode = {}); [[maybe_unused]] std::vector CreateComputeKernel( Program &program, const std::string &file_name, std::vector args, ComputeKernelConfig config); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp index a1f82e41ec7..d7ec2a28105 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp @@ -99,7 +99,7 @@ MorehBiasAddBackwardOperation::SingleCoreProgramFactory::create( //////////////////////////////////////////////////////////////////////////// // ComputeKernel SetUp //////////////////////////////////////////////////////////////////////////// - vector compute_kernel_args = {}; + std::vector compute_kernel_args = {}; std::map compute_defines; compute_defines["REDUCE_OP"] = "PoolType::SUM"; compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_SCALAR"; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp index a1f27385d23..2be95305f90 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp @@ -372,7 +372,7 @@ MorehMatmulOperation::MultiCoreProgramFactory::cached_program_t MorehMatmulOpera compute_args_group_1.push_back(static_cast(is_scalar_bias)); } - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = "1"; unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp index 6efb0963197..0bd21b787a5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp @@ -112,7 +112,7 @@ MorehMeanOperation::MorehMeanNCFactory::cached_program_t MorehMeanOperation::Mor if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = 1; } - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); auto compute_kernel_ids = CreateComputeKernel( program, compute_kernel_file, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp index 541fa26b0a8..5aed254b1c9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp @@ -113,19 +113,19 @@ 
MorehMeanOperation::MorehMeanWFactory::cached_program_t MorehMeanOperation::More if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = 1; } - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { units_per_core_group_1, // Ht Wt, // Wt 1, // NC origin_W, }; - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { units_per_core_group_2, // Ht Wt, // Wt 1, // NC origin_W, }; - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); auto compute_kernel_ids = CreateComputeKernel( program, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp index d6e26dd5b27..c088b4ae548 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp @@ -159,7 +159,7 @@ MorehMeanBackwardOperation::MorehMeanBackwardFactory::create( "ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp"; const std::vector compute_args_group_1{num_cols_per_core_group_1, need_bcast_dim[0], need_bcast_dim[1]}; const std::vector compute_args_group_2{num_cols_per_core_group_2, need_bcast_dim[0], need_bcast_dim[1]}; - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); auto compute_kernel_ids = tt::operations::primary::CreateComputeKernel( program, compute_kernel_file, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp index 5301951bcc6..a552aa2eb05 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp @@ -148,7 +148,7 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev } uint32_t element_size = weight_has_value ? weight.value().element_size() : 0; - vector reader_args = { + std::vector reader_args = { target_addr, weight_addr, static_cast(ignore_index), @@ -161,7 +161,7 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev target.element_size(), }; - vector writer_args = {output_addr, num_units_per_core, tile_offset}; + std::vector writer_args = {output_addr, num_units_per_core, tile_offset}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp index 92a00767a12..608e82b1b57 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp @@ -148,7 +148,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 TT_THROW("Core not in specified core ranges"); } - vector reader_args = { + std::vector reader_args = { input_addr, target_addr, weight_addr, @@ -161,7 +161,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 input.element_size(), }; - vector writer_args 
= { + std::vector writer_args = { output_addr, units_per_core, tile_offset, @@ -327,7 +327,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 TT_THROW("Core not in specified core ranges"); } - vector reader_args = { + std::vector reader_args = { input_addr, target_addr, weight_addr, @@ -341,7 +341,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 input.element_size(), }; - vector writer_args = { + std::vector writer_args = { output_addr, units_per_core, tile_offset, @@ -516,7 +516,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 TT_THROW("Core not in specified core ranges"); } - vector reader_args = { + std::vector reader_args = { input_addr, target_addr, weight_addr, @@ -532,7 +532,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 input.element_size(), }; - vector writer_args = { + std::vector writer_args = { output_addr, units_per_core, tile_offset, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp index 796aa0e121b..e8be177e76f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp @@ -200,7 +200,7 @@ MorehSgdOperation::ProgramFactory::cached_program_t MorehSgdOperation::ProgramFa u_weight_decay.f = weight_decay; u_one.f = 1.0f; - vector reader_args = { + std::vector reader_args = { param_in.buffer()->address(), grad.buffer()->address(), momentum_buffer_in.has_value() ? momentum_buffer_in.value().buffer()->address() : 0, @@ -213,7 +213,7 @@ MorehSgdOperation::ProgramFactory::cached_program_t MorehSgdOperation::ProgramFa u_one.u, }; - vector writer_args = { + std::vector writer_args = { param_out.buffer()->address(), momentum_buffer_out.has_value() ? 
momentum_buffer_out.value().buffer()->address() : 0, num_tiles_per_core, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp index f70bb6b5813..73a87fcf49c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp @@ -128,10 +128,10 @@ MorehSoftmaxOperation::MorehSoftmaxCLargeFactory::create( TT_THROW("Core not in specified core ranges"); } - vector reader_args = { + std::vector reader_args = { input.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size}; - vector writer_args = { + std::vector writer_args = { output.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp index 85532b0bcfc..4b38c7c02e3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp @@ -127,7 +127,7 @@ MorehSoftmaxOperation::MorehSoftmaxHLargeFactory::create( uint32_t mask_h = input.get_logical_shape()[-2] % tt::constants::TILE_HEIGHT; if (mask_h == 0) mask_h = tt::constants::TILE_HEIGHT; - vector reader_args = { + std::vector reader_args = { input.buffer()->address(), num_tiles_per_core, tile_offset, @@ -136,7 +136,7 @@ MorehSoftmaxOperation::MorehSoftmaxHLargeFactory::create( *reinterpret_cast(&scaler), mask_h}; - vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; + std::vector writer_args = {output.buffer()->address(), 
num_tiles_per_core, tile_offset, Ht, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp index 7dc08209882..b182ec8d63c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp @@ -128,7 +128,7 @@ MorehSoftmaxOperation::MorehSoftmaxHSmallFactory::create( uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT; if (mask_h == 0) mask_h = tt::constants::TILE_HEIGHT; - vector reader_args = { + std::vector reader_args = { input.buffer()->address(), num_tiles_per_core, tile_offset, @@ -137,7 +137,7 @@ MorehSoftmaxOperation::MorehSoftmaxHSmallFactory::create( *reinterpret_cast(&scaler), mask_h}; - vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; + std::vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp index 612677427e2..2622708e47f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp @@ -128,7 +128,7 @@ MorehSoftmaxOperation::MorehSoftmaxWLargeFactory::create( uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH; if (mask_w == 0) mask_w = tt::constants::TILE_WIDTH; - vector reader_args = { + std::vector 
reader_args = { input.buffer()->address(), num_tiles_per_core, tile_offset, @@ -136,7 +136,7 @@ MorehSoftmaxOperation::MorehSoftmaxWLargeFactory::create( *reinterpret_cast(&scaler), mask_w}; - vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; + std::vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp index 3bb8ba52f3d..a43840e1949 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp @@ -127,7 +127,7 @@ MorehSoftmaxOperation::MorehSoftmaxWSmallFactory::create( uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH; if (mask_w == 0) mask_w = tt::constants::TILE_WIDTH; - vector reader_args = { + std::vector reader_args = { input.buffer()->address(), num_tiles_per_core, tile_offset, @@ -135,7 +135,7 @@ MorehSoftmaxOperation::MorehSoftmaxWSmallFactory::create( *reinterpret_cast(&scaler), mask_w}; - vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; + std::vector writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp index 6031a007f22..9445e917f7b 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp @@ -130,7 +130,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardCLargeFactory::create( TT_THROW("Core not in specified core ranges"); } - vector reader_args = { + std::vector reader_args = { output.buffer()->address(), output_grad.buffer()->address(), num_tiles_per_core, @@ -139,7 +139,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardCLargeFactory::create( inner_size, dim_size}; - vector writer_args = { + std::vector writer_args = { input_grad.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp index 0174df56159..7d8f06884dc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp @@ -132,7 +132,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHLargeFactory::create( uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT; if (mask_h == 0) mask_h = tt::constants::TILE_HEIGHT; - vector reader_args = { + std::vector reader_args = { output.buffer()->address(), output_grad.buffer()->address(), num_tiles_per_core, @@ -142,7 +142,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHLargeFactory::create( *reinterpret_cast(&scaler), mask_h}; - vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; + std::vector writer_args = 
{input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp index fe72331b129..997d1b56259 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp @@ -130,7 +130,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHSmallFactory::create( uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT; if (mask_h == 0) mask_h = tt::constants::TILE_HEIGHT; - vector reader_args = { + std::vector reader_args = { output.buffer()->address(), output_grad.buffer()->address(), num_tiles_per_core, @@ -140,7 +140,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHSmallFactory::create( *reinterpret_cast(&scaler), mask_h}; - vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; + std::vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp index fd720537431..8090c3c232f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp @@ -132,7 +132,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWLargeFactory::create( uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH; if (mask_w == 0) mask_w = tt::constants::TILE_WIDTH; - vector reader_args = { + std::vector reader_args = { output.buffer()->address(), output_grad.buffer()->address(), num_tiles_per_core, @@ -141,7 +141,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWLargeFactory::create( *reinterpret_cast(&scaler), mask_w}; - vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; + std::vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp index 5e04e023140..213741f30de 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp @@ -127,7 +127,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWSmallFactory::create( uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH; if (mask_w == 0) mask_w = tt::constants::TILE_WIDTH; - vector reader_args = { + std::vector reader_args = { output.buffer()->address(), output_grad.buffer()->address(), num_tiles_per_core, @@ -136,7 +136,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWSmallFactory::create( *reinterpret_cast(&scaler), mask_w}; - vector writer_args = {input_grad.buffer()->address(), 
num_tiles_per_core, tile_offset, Wt}; + std::vector writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt}; SetRuntimeArgs(program, reader_kernel_id, core, reader_args); SetRuntimeArgs(program, writer_kernel_id, core, writer_args); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp index 285e20a337a..594b27f1ff0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp @@ -154,13 +154,13 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu reduce_defines["FP32_DEST_ACC_EN"] = "1"; } - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { Ht, // Ht num_cols_per_core_group_1, // Wt 1, // NC origin_H}; - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; } @@ -177,7 +177,7 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu .defines = reduce_defines}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { Ht, // Ht num_cols_per_core_group_2, // Wt 1, // NC diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp index d123bc890da..ebd8d432981 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp @@ -104,7 +104,7 @@ MorehSumOperation::MorehSumNCFactory::cached_program_t 
MorehSumOperation::MorehS compute_defines["FP32_DEST_ACC_EN"] = "1"; } // set unpack_to_dest_mode to the same value as fp32_dest_acc_en - vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); auto compute_kernel_file = "ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp"; if (device->arch() == tt::ARCH::GRAYSKULL) { diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp index 392542fdaad..d9e387165e4 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp @@ -252,7 +252,7 @@ operation::ProgramWithCallbacks layernorm_multi_core( tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args) ); - vector compute_args = { Wt, block_size, gamma.has_value(), beta.has_value(), fp32_dest_acc_en }; + std::vector compute_args = { Wt, block_size, gamma.has_value(), beta.has_value(), fp32_dest_acc_en }; auto compute_kernels_id = CreateKernel( program, @@ -412,7 +412,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( bool is_post_all_gather = distributed_norm_stage == DistributedLayerNormStage::POST_ALL_GATHER; //////////////////////////////////////////////////////////////////////////// - // Grayskull Device Setup + // Device Setup //////////////////////////////////////////////////////////////////////////// Device *device = a.device(); @@ -422,8 +422,20 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = get_compute_kernel_config_args(device->arch(), compute_kernel_config); - if (fp32_dest_acc_en) { - 
TT_ASSERT(subblock_wt <= 4, "subblock width must less than 4 in fp32 mode"); + if (dst_full_sync_en == false) { + if (fp32_dest_acc_en) { + TT_FATAL(subblock_wt <= 4, "subblock_wt={}, but subblock width must less than 4 tiles in fp32 mode when dst_full_sync_en is false", subblock_wt); + } + else { + TT_FATAL(subblock_wt <= 8, "subblock_wt={}, but subblock width must less than 8 tiles when dst_full_sync_en is false", subblock_wt); + } + } else { + if (fp32_dest_acc_en) { + TT_FATAL(subblock_wt <= 8, "subblock_wt={}, but subblock width must less than 8 tiles in fp32 mode when dst_full_sync_en is true", subblock_wt); + } + else { + TT_FATAL(subblock_wt <= 16, "subblock_wt={}, but subblock width must less than 16 tiles when dst_full_sync_en is true", subblock_wt); + } } tt::DataFormat out_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp index 339f881f6fa..3daf51392fc 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp @@ -270,7 +270,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args) ); - vector compute_args = { Wt, block_size, stats_tiles_cols, gamma.has_value(), beta.has_value(), fp32_dest_acc_en }; + std::vector compute_args = { Wt, block_size, stats_tiles_cols, gamma.has_value(), beta.has_value(), fp32_dest_acc_en }; auto compute_kernels_id = CreateKernel( program, diff --git 
a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp index a0f86c3f1e8..aedca4b1c2a 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp @@ -183,7 +183,7 @@ operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core( tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args) ); - vector compute_args = { Wt, block_size }; + std::vector compute_args = { Wt, block_size }; auto compute_kernels_id = CreateKernel( program, diff --git a/ttnn/cpp/ttnn/operations/numpy/functions.hpp b/ttnn/cpp/ttnn/operations/numpy/functions.hpp index 62c2dc058cc..f6540f6e168 100644 --- a/ttnn/cpp/ttnn/operations/numpy/functions.hpp +++ b/ttnn/cpp/ttnn/operations/numpy/functions.hpp @@ -444,7 +444,7 @@ static Tensor fill_first_val_into_tensor( auto owned_buffer = tt::tt_metal::owned_buffer::create(physical_volume); // ouput auto device_buffer = input_tensor.device_buffer(); uint32_t size_in_bytes = device_buffer->size(); - vector data_vec; + std::vector data_vec; const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); @@ -478,7 +478,7 @@ static Tensor prod_result_computation_GS( auto owned_buffer = tt::tt_metal::owned_buffer::create(input_tensor.volume()); // ouput auto device_buffer = input_tensor.device_buffer(); uint32_t size_in_bytes = device_buffer->size(); - vector data_vec; + std::vector data_vec; const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { 
data_vec.resize(size_in_bytes / sizeof(T)); @@ -527,7 +527,7 @@ static Tensor prod_result_computation_WH_B0( auto owned_buffer = tt::tt_metal::owned_buffer::create(tt::tt_metal::compute_volume(s_a)); // ouput auto device_buffer = input_tensor.device_buffer(); uint32_t size_in_bytes = device_buffer->size(); - vector data_vec; + std::vector data_vec; const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); @@ -654,7 +654,7 @@ static Tensor manual_insertion( "Required shape volume must match old shape volume"); auto device_buffer = input_tensor.device_buffer(); uint32_t size_in_bytes = device_buffer->size(); - vector data_vec; + std::vector data_vec; const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp index 29e75f98df8..c0b6399660a 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp @@ -570,7 +570,7 @@ operation::ProgramWithCallbacks downsample_single_core( core_range, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_args = { + std::vector compute_args = { input_cb_index, halo_prev_input_cb_index, halo_next_input_cb_index, @@ -761,7 +761,7 @@ operation::ProgramWithCallbacks downsample_single_core( TT_ASSERT(v.output_flat_h == 0); // Compile runtime args - vector compile_rt_kernel_args = { + std::vector compile_rt_kernel_args = { local_input_num_rows_of_tiles, local_input_offset_rows_of_tiles, halo_prev_read_enabled, @@ -773,7 +773,7 @@ operation::ProgramWithCallbacks downsample_single_core( 
tt::tt_metal::SetRuntimeArgs(program, downsample_compute_kernel_id, core, compile_rt_kernel_args); // Writer runtime args - vector writer_kernel_args = { + std::vector writer_kernel_args = { (uint32_t)img_height, (uint32_t)img_width, (uint32_t)img_stride_h, diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp index c29bb743c73..cb69c9ec164 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp @@ -235,7 +235,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor& // runtime args uint32_t reader_nargs = 10; - vector reader_rt_args(reader_nargs); + std::vector reader_rt_args(reader_nargs); reader_rt_args[0] = input_stick_nbytes; reader_rt_args[1] = input_nsticks_per_core / in_w; reader_rt_args[2] = scale_factor_h; diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp index d43270ca96a..da671cb659a 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp @@ -59,7 +59,7 @@ std::vector UpSample::create_output_tensors(const std::vector &i auto output_shape = compute_output_shapes(inputs).at(0); if (input.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { auto ncores = input_shard_spec.num_cores(); - array output_shard_shape = {div_up(output_shape[0] * output_shape[1] * output_shape[2], ncores), output_shape[-1]}; + std::array output_shard_shape = {div_up(output_shape[0] * output_shape[1] * output_shape[2], ncores), output_shape[-1]}; auto output_shard_spec = input_shard_spec; output_shard_spec.shape = output_shard_shape; mem_config.shard_spec = 
output_shard_spec; @@ -72,7 +72,7 @@ std::vector UpSample::create_output_tensors(const std::vector &i auto core_range = *shard_grid.begin(); uint32_t ncores_w = core_range.end_coord.x + 1; uint32_t ncores_h = core_range.end_coord.y + 1; - // array output_shard_shape = {output_shape[0] * output_shape[1] * output_shape[2] / ncores_h, output_shape[-1] / ncores_w}; + // std::array output_shard_shape = {output_shape[0] * output_shape[1] * output_shape[2] / ncores_h, output_shape[-1] / ncores_w}; // auto output_shard_spec = input_shard_spec; // output_shard_spec.shape = output_shard_shape; // mem_config.shard_spec = output_shard_spec; diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp index e20cf9bfd19..b2deccc8f2f 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp @@ -130,7 +130,7 @@ operation::ProgramWithCallbacks upsample_multi_core(const Tensor &input, Tensor& // runtime args uint32_t writer_nargs = 7; - vector writer_rt_args(writer_nargs); + std::vector writer_rt_args(writer_nargs); writer_rt_args[0] = input_stick_nbytes; writer_rt_args[1] = input_nsticks_per_core / in_w; writer_rt_args[2] = scale_factor_h; diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp index 1a5fd46bce4..fd6424cb0bf 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp @@ -149,7 +149,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h( tt_metal::KernelHandle writer_kernel_id; if (out_sharded) { - vector writer_ct_args = { + std::vector 
writer_ct_args = { output_cb_index, }; writer_kernel_id = CreateKernel( @@ -168,7 +168,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h( tt_metal::WriterDataMovementConfig(writer_compile_time_args)); } std::map reduce_defines = reduce_op_utils::get_defines(reduce_op, ReduceOpDim::H); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { Ht, // Ht num_cols_per_core_group_1, // Wt 1, // NC @@ -185,7 +185,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h( .defines = reduce_defines}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { Ht, // Ht num_cols_per_core_group_2, // Wt 1, // NC @@ -208,11 +208,11 @@ operation::ProgramWithCallbacks reduce_multi_core_h( uint32_t shard_Wt = num_cols_per_core_group_1 / NC; uint32_t shard_row_size = shard_Wt * src0_single_tile_size; uint32_t shard_batch_size = shard_row_size * Ht; - vector reader_rt_args = { + std::vector reader_rt_args = { num_cols_per_core_group_1 * Ht, shard_Wt, Ht, NC, shard_row_size, shard_batch_size, packed_scaler_value}; tt_metal::SetRuntimeArgs(program, reader_kernel_id, all_cores, reader_rt_args); - vector writer_rt_args = {num_cols_per_core_group_1}; + std::vector writer_rt_args = {num_cols_per_core_group_1}; tt_metal::SetRuntimeArgs(program, writer_kernel_id, all_cores, writer_rt_args); } else { for (uint32_t i = 0, num_cols_read = 0; i < num_cores; i++) { diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp index 9205f800f79..1756cf29345 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp @@ -94,7 +94,7 @@ operation::ProgramWithCallbacks reduce_multi_core_w( all_cores, 
tt_metal::WriterDataMovementConfig(writer_compile_time_args, reduce_defines)); - vector compute_kernel_args_group_1 = { + std::vector compute_kernel_args_group_1 = { num_rows_per_core_group_1, // Ht Wt, // Wt 1, // NC @@ -111,7 +111,7 @@ operation::ProgramWithCallbacks reduce_multi_core_w( .defines = reduce_defines}); if (!core_group_2.ranges().empty()) { - vector compute_kernel_args_group_2 = { + std::vector compute_kernel_args_group_2 = { num_rows_per_core_group_2, // Ht Wt, // Wt 1, // NC diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp index f3cfe56730c..66eaccf2e20 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp @@ -96,7 +96,7 @@ operation::ProgramWithCallbacks reduce_single_core_hw( core, tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - vector compute_kernel_args = { + std::vector compute_kernel_args = { Ht, // Ht Wt, // Wt NC, // NC diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp index 4c7a83cd11c..390dec0034a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp @@ -67,7 +67,7 @@ namespace primary { core, tt_metal::WriterDataMovementConfig{writer_compile_time_args}); - vector compute_kernel_args = { + std::vector compute_kernel_args = { num_tiles, // per_core_block_cnt 1 // per_core_block_size }; diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp index e01bb7997cd..06e8a52e7b5 100644 --- 
a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp @@ -71,7 +71,7 @@ owned_buffer::Buffer conv_using_op_trace_metadata( uint32_t padded_input_w, uint32_t out_tensor_size) { auto conv_tensor_buf = owned_buffer::create(out_tensor_size); - vector input_window; + std::vector input_window; uint32_t out_idx = 0; for (auto anchor : op_trace_metadata) { for (uint32_t h = 0; h < filter_h; h++) { @@ -135,10 +135,10 @@ owned_buffer::Buffer conv_using_shard_boundaries( owned_buffer::Buffer conv_using_sliding_window_op_config( const owned_buffer::Buffer &input_padded_tensor_buf, - const vector &filter_vector, + const std::vector &filter_vector, const std::vector &op_trace_metadata, - const vector> &shard_boundaries, - const vector> &sharded_input_top_left_indices, + const std::vector> &shard_boundaries, + const std::vector> &sharded_input_top_left_indices, uint32_t input_h, uint32_t input_w, uint32_t stride_h, @@ -149,7 +149,7 @@ owned_buffer::Buffer conv_using_sliding_window_op_config( uint32_t out_tensor_size) { auto conv_tensor_buf = owned_buffer::create(out_tensor_size); - vector input_window; + std::vector input_window; uint32_t out_idx = 0; for (auto j = 0; j < sharded_input_top_left_indices.size(); j++) { @@ -176,7 +176,7 @@ owned_buffer::Buffer conv_using_sliding_window_op_config( } std::vector pad_metadata_from_tensor_metadata(const std::vector> &tensor_metadata) { - vector ref_pad_metadata; + std::vector ref_pad_metadata; for (auto i = 0; i < tensor_metadata.size(); i++) { auto is_pad_stick = tensor_metadata[i].first; if (is_pad_stick) { diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp index 90a899ccb86..c6de3eeec0d 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp @@ 
-25,14 +25,14 @@ owned_buffer::Buffer ref_conv_op( const Shape &input_nchw_shape, uint32_t stride_h, uint32_t stride_w, - const vector &filter_vector, + const std::vector &filter_vector, const Shape &filter_pyt_tensor_shape, const Shape &out_golden_pyt_tensor_shape); // Calculate convolution using op_trace_metadata on padded input buffer. owned_buffer::Buffer conv_using_op_trace_metadata( const owned_buffer::Buffer &input_padded_tensor_buf, - const vector &filter_vector, + const std::vector &filter_vector, const std::vector &op_trace_metadata, uint32_t stride_h, uint32_t stride_w, @@ -44,8 +44,8 @@ owned_buffer::Buffer conv_using_op_trace_metadata( // Calculate convolution using shards on padded input buffer. owned_buffer::Buffer conv_using_shard_boundaries( const owned_buffer::Buffer &input_padded_tensor_buf, - const vector &filter_vector, - const vector> &shard_boundaries, + const std::vector &filter_vector, + const std::vector> &shard_boundaries, uint32_t stride_h, uint32_t stride_w, uint32_t padded_input_h, @@ -59,10 +59,10 @@ owned_buffer::Buffer conv_using_shard_boundaries( // Calculate convolution using sliding window op configs on padded input buffer. owned_buffer::Buffer conv_using_sliding_window_op_config( const owned_buffer::Buffer &input_padded_tensor_buf, - const vector &filter_vector, + const std::vector &filter_vector, const std::vector &op_trace_metadata, - const vector> &shard_boundaries, - const vector> &sharded_input_top_left_indices, + const std::vector> &shard_boundaries, + const std::vector> &sharded_input_top_left_indices, uint32_t input_h, uint32_t input_w, uint32_t stride_h, @@ -73,23 +73,23 @@ owned_buffer::Buffer conv_using_sliding_window_op_config( uint32_t out_tensor_size); // Calculate Padding using tensor metadata. 
-vector pad_metadata_from_tensor_metadata(const vector> &tensor_metadata); +std::vector pad_metadata_from_tensor_metadata(const std::vector> &tensor_metadata); // Calculate Indices of pads in padded input buffer using halo kernel config's flattened pad config. -vector pad_indices_from_flattened_pad_config( - const vector> &flattened_pad_config, - const vector> &shard_boundaries); +std::vector pad_indices_from_flattened_pad_config( + const std::vector> &flattened_pad_config, + const std::vector> &shard_boundaries); // Calculate Indices of valid inputs in padded input buffer using halo kernel config's flattened local configs. -vector input_indices_from_flattened_local_config( - const vector> &flattened_local_config, - const vector> &shard_boundaries); +std::vector input_indices_from_flattened_local_config( + const std::vector> &flattened_local_config, + const std::vector> &shard_boundaries); // Calculate Indices of valid inputs in padded input buffer using halo kernel config's flattened remote configs. 
-vector input_indices_from_flattened_remote_config( +std::vector input_indices_from_flattened_remote_config( tt::tt_metal::Device *device, - const vector> &flattened_remote_config, - const vector> &shard_boundaries, + const std::vector> &flattened_remote_config, + const std::vector> &shard_boundaries, bool remote_read = false, bool is_block_sharded = false, bool transpose_mcast = false); diff --git a/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp b/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp index dc31fff1b31..39fd1275da4 100644 --- a/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp @@ -9,21 +9,27 @@ namespace ttnn::operations::uniform { void bind_uniform_operation(py::module &module) { - auto doc = - R"doc(uniform(input: Tensor, from: float = 0, to: float = 1, memory_config: Optional[MemoryConfig] = None, compute_kernel_config: Optional[ComputeKernelConfig] = None) -> Tensor - Generates a tensor with values drawn from a uniform distribution [`from`, `to`). The input tensor provides the shape for the output tensor, while the data type remains unchanged. - This operation allows configuration of memory allocation using `memory_config` and computation settings via `compute_kernel_config`. - - Args: - * :attr:`input`: The tensor that provides the shape for the generated uniform tensor. - * :attr:`from`: The lower bound of the uniform distribution. Defaults to 0. - * :attr:`to`: The upper bound of the uniform distribution. Defaults to 1. - * :attr:`memory_config`: The memory configuration for the generated tensor. - * :attr:`compute_kernel_config`: Optional configuration for the compute kernel used during generation. - - Returns: - Tensor: A new tensor with the same shape as `input` and values drawn from the specified uniform distribution. - )doc"; + std::string doc = + R"doc( + Update in-place the input tensor with values drawn from the continuous uniform distribution 1 / (`to` - `from`). 
+ + Args: + input (ttnn.Tensor): The tensor that provides the shape for the generated uniform tensor. + from (float32): The lower bound of the uniform distribution. Defaults to 0. + to (float32): The upper bound of the uniform distribution. Defaults to 1. + + Keyword args: + memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. + compute_kernel_config (ttnn.DeviceComputeKernelConfig, optional): Configuration for the compute kernel. Defaults to `None`. + + Returns: + ttnn.Tensor: The `input` tensor with updated values drawn from the specified uniform distribution. + + Example: + >>> input = ttnn.to_device(ttnn.from_torch(torch.ones(3, 3), dtype=torch.bfloat16)), device=device) + >>> ttnn.uniform(input) + + )doc"; bind_registered_operation( module, diff --git a/ttnn/cpp/ttnn/reports.hpp b/ttnn/cpp/ttnn/reports.hpp index 9392f8eda7c..0eee2efedbc 100644 --- a/ttnn/cpp/ttnn/reports.hpp +++ b/ttnn/cpp/ttnn/reports.hpp @@ -7,6 +7,7 @@ #include #include "tt_metal/impl/buffers/buffer.hpp" +#include "tt_metal/impl/device/device_pool.hpp" namespace ttnn { @@ -64,49 +65,52 @@ struct BufferInfo { std::vector get_buffers() { std::vector buffer_infos; - for (const auto &[key, buffer] : tt::tt_metal::detail::BUFFER_MAP.value()) { - auto [device_id, address] = key; - auto device = buffer->device(); - - auto num_pages = buffer->num_pages(); - auto page_size = buffer->page_size(); - auto num_banks = device->num_banks(buffer->buffer_type()); - - std::map bank_to_num_pages; - if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { - uint32_t bank_id = 0; - for (int page_index = 0; page_index < num_pages; page_index++) { - if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) { - bank_to_num_pages[bank_id] = 0; + for (const auto &device : tt::DevicePool::instance().get_all_active_devices()) { + for (const auto &buffer : device->get_allocated_buffers()) { + auto device_id = device->id(); + auto address = 
buffer->address(); + + auto num_pages = buffer->num_pages(); + auto page_size = buffer->page_size(); + auto num_banks = device->num_banks(buffer->buffer_type()); + + std::map bank_to_num_pages; + if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { + uint32_t bank_id = 0; + for (int page_index = 0; page_index < num_pages; page_index++) { + if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) { + bank_to_num_pages[bank_id] = 0; + } + bank_to_num_pages[bank_id]++; + bank_id = (bank_id + 1) % num_banks; } - bank_to_num_pages[bank_id]++; - bank_id = (bank_id + 1) % num_banks; - } - } else { - const auto& buffer_page_mapping = *buffer->get_buffer_page_mapping(); - for (int page_index = 0; page_index < num_pages; page_index++) { - auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index]; - auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]]; - auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0]; - - if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) { - bank_to_num_pages[bank_id] = 0; + } else { + const auto &buffer_page_mapping = *buffer->get_buffer_page_mapping(); + for (int page_index = 0; page_index < num_pages; page_index++) { + auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index]; + auto core = + buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]]; + auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0]; + + if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) { + bank_to_num_pages[bank_id] = 0; + } + bank_to_num_pages[bank_id]++; } - bank_to_num_pages[bank_id]++; } - } - auto max_num_pages = - std::max_element(bank_to_num_pages.begin(), bank_to_num_pages.end(), [](const auto &a, const auto &b) { - return a.second < b.second; - }); - - BufferInfo buffer_info = {}; - buffer_info.device_id = 
device_id; - buffer_info.address = address; - buffer_info.max_size_per_bank = (*max_num_pages).second * page_size; - buffer_info.buffer_type = buffer->buffer_type(); - buffer_infos.push_back(buffer_info); + auto max_num_pages = + std::max_element(bank_to_num_pages.begin(), bank_to_num_pages.end(), [](const auto &a, const auto &b) { + return a.second < b.second; + }); + + BufferInfo buffer_info = {}; + buffer_info.device_id = device_id; + buffer_info.address = address; + buffer_info.max_size_per_bank = (*max_num_pages).second * page_size; + buffer_info.buffer_type = buffer->buffer_type(); + buffer_infos.push_back(buffer_info); + } } return buffer_infos; } @@ -125,23 +129,35 @@ struct BufferPageInfo { std::vector get_buffer_pages() { std::vector buffer_page_infos; - for (const auto &[key, buffer] : tt::tt_metal::detail::BUFFER_MAP.value()) { - if (not buffer->is_l1()) { - continue; - } + for (const auto &device : tt::DevicePool::instance().get_all_active_devices()) { + for (const auto &buffer : device->get_allocated_buffers()) { + if (not buffer->is_l1()) { + continue; + } - auto [device_id, address] = key; - auto device = buffer->device(); + auto device_id = device->id(); + auto address = buffer->address(); - uint32_t page_size = buffer->page_size(); - auto num_pages = buffer->num_pages(); - auto num_banks = device->num_banks(buffer->buffer_type()); + auto page_size = buffer->page_size(); + auto num_pages = buffer->num_pages(); + auto num_banks = device->num_banks(buffer->buffer_type()); - if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { uint32_t bank_id = 0; for (int page_index = 0; page_index < num_pages; page_index++) { - auto page_address = buffer->page_address(bank_id, page_index); - auto core = buffer->logical_core_from_bank_id(bank_id); + CoreCoord core; + DeviceAddr page_address = 0; + + if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { + page_address = buffer->page_address(bank_id, page_index); + 
core = buffer->logical_core_from_bank_id(bank_id); + bank_id = (bank_id + 1) % num_banks; + } else { + const auto &buffer_page_mapping = *buffer->get_buffer_page_mapping(); + auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index]; + core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]]; + bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0]; + page_address = buffer->sharded_page_address(bank_id, dev_page_index); + } BufferPageInfo buffer_page_info = {}; buffer_page_info.device_id = device_id; @@ -153,28 +169,7 @@ std::vector get_buffer_pages() { buffer_page_info.page_address = page_address; buffer_page_info.page_size = page_size; buffer_page_info.buffer_type = buffer->buffer_type(); - buffer_page_infos.push_back(buffer_page_info); - bank_id = (bank_id + 1) % num_banks; - } - } else { - const auto& buffer_page_mapping = *buffer->get_buffer_page_mapping(); - for (int page_index = 0; page_index < num_pages; page_index++) { - auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index]; - auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]]; - auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0]; - auto page_address = buffer->sharded_page_address(bank_id, dev_page_index); - - BufferPageInfo buffer_page_info = {}; - buffer_page_info.device_id = device_id; - buffer_page_info.address = address; - buffer_page_info.core_y = core.y; - buffer_page_info.core_x = core.x; - buffer_page_info.bank_id = bank_id; - buffer_page_info.page_index = page_index; - buffer_page_info.page_address = page_address; - buffer_page_info.page_size = page_size; - buffer_page_info.buffer_type = buffer->buffer_type(); buffer_page_infos.push_back(buffer_page_info); } } diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index bea72879c2d..d72df0a512e 100644 --- 
a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -664,7 +664,7 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id auto device = tensor.device(); TT_ASSERT(device != nullptr && "Need device to be set copy data from device to host!"); uint32_t size_in_bytes = device_buffer->size(); - vector data_vec; + std::vector data_vec; const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp index a27b2e0bfdb..c9aaaf31b24 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp @@ -271,7 +271,7 @@ inline void read_data_from_device_buffer( } template -inline void read_data_from_device_buffer(DeviceBuffer device_buffer, vector& host_buffer) { +inline void read_data_from_device_buffer(DeviceBuffer device_buffer, std::vector& host_buffer) { std::vector host_buffer_uint32; ::detail::ReadFromBuffer(device_buffer, host_buffer_uint32); host_buffer = unpack_uint32_vec(host_buffer_uint32); diff --git a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp index 692e5b361fa..09f436acafd 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp @@ -64,7 +64,7 @@ static std::vector compute_strides(const ttnn::SimpleShape& shape) { return strides; } -static int compute_flat_indices(const vector& indices, const vector strides) { +static int compute_flat_indices(const std::vector& indices, const std::vector strides) { int flat_index = 0; for (auto i = 0; i < indices.size(); i++) { flat_index += indices[i] * strides[i];